library(tidyverse)
library(readxl)
library(ggforce)
library(concaveman)
library(knitr)
library(olsrr)
library(ranger)
library(Metrics)
library(mgcv)
library(caret)
library(patchwork)
library(glmnet)
library(broom)
library(RColorBrewer)

options(scipen = 999)
set.seed(3630)
# Strike Zone GG Object
geom_zone <- function(top = 11/3, bottom = 3/2, linecolor = "black"){
  geom_rect(xmin = -.7083, xmax = .7083, ymin = bottom, ymax = top,
            alpha = 0, color = linecolor, linewidth = 0.75)
}

# c(0, 0, -.25, -.5, -.25))

# Home Plate GG Object
geom_plate <- function(pov = "pitcher"){
  df <- case_when(
    pov == "pitcher" ~ 
      data.frame(x = c(-.7083, .7083, .7083 ,0, -.7083), y = c(0, 0, .25, .5, .25)),
    pov == "catcher" ~ 
      data.frame(x = c(-.7083, .7083, .7083 ,0, -.7083), y = c(0, 0, -.25, -.5, -.25))
  )
  
  g <- geom_polygon(data = df, aes(x = x, y = y), fill = "white", color = "black", linewidth = 1.25)
  g
}

# Barrel Function
is.barrel <- function(LA, EV){
  upper <- 1.11*EV - 78.89
  lower <- -EV + 124
  outcome <- (LA >= lower) & (LA <= upper) & (EV >= 98) & (LA >= 8) & (LA <= 50)
  outcome <- replace_na(outcome, FALSE)
  outcome
}

# Normal Name Changer
swap_names <- function(name) {
  parts <- strsplit(name, ", ")[[1]]
  if (length(parts) == 2) {
    return(paste(rev(parts), collapse = " "))
  } else {
    return(name)
  }
}
seasonal <- read_csv("CSVs/season_stats.csv")

pitchers <- read_csv("CSVs/pitcher_comps.csv")

arsenal <- read_csv("CSVs/arsenal.csv")

empty <- read_csv("CSVs/bases_empty.csv")

whiff <- pitchers %>% 
  mutate(whiff = description == "swinging_strike",
         whiff = as.character(whiff)) %>% 
  filter(pitch_type != "NA",
         pitch_type != "PO")

LHP <- read_csv("CSVs/lhp_pitches.csv") %>% 
  select(-...1) %>% 
  filter(!is.na(pitch_type)) %>% 
  mutate(pitch_type = str_replace(pitch_type, "CS", "CU"),
         pitch_name = str_replace(pitch_name, "Slow Curve", "Curveball"),
         pitch_type = str_replace(pitch_type, "KC", "CU"),
         pitch_name = str_replace(pitch_name, "Knuckle Curve", "Curveball"))

RHP <- read_csv("CSVs/rhp_pitches.csv") %>% 
  select(-...1)


whiff_l <- LHP %>% 
  mutate(whiff = description == "swinging_strike",
         whiff = as.character(whiff))

all <- read_csv("CSVs/all_pitches.csv") %>% 
  mutate(distance_sweet = sqrt(((plate_x - 0.85)^2)+((plate_z - 1.55)^2)))



Linear Regression With/Without Interaction Terms


# Model Data (Pitch = Slider, Pitching Hand = Right)
model_data <- arsenal %>% 
  filter(pitch_type == "SL",
         pitch_hand == "R") %>% 
  mutate(ovr_break = sqrt(pitcher_break_x^2 + pitcher_break_z^2))

# Simple Linear Regression
lm_simple <- lm(xwOBA ~ 
                  pitch_speed + spin_rate + pitcher_break_x + pitcher_break_z +
                  pitch_usage + ovr_break,
                data = model_data)


# Overview of all model combinations
model_all <- ols_step_all_possible(lm_simple)

# Backwards Elimination
lm_simple %>% ols_step_backward_p(penter = 0.2)
## 
## 
##                                Stepwise Summary                                
## -----------------------------------------------------------------------------
## Step    Variable        AIC         SBC         SBIC         R2       Adj. R2 
## -----------------------------------------------------------------------------
##  0      Full Model    -879.334    -848.821    -1829.725    0.05444    0.03714 
##  1      spin_rate     -881.055    -854.356    -1831.498    0.05365    0.03927 
## -----------------------------------------------------------------------------
## 
## Final Model Output 
## ------------------
## 
##                          Model Summary                           
## ----------------------------------------------------------------
## R                       0.232       RMSE                  0.064 
## R-Squared               0.054       MSE                   0.004 
## Adj. R-Squared          0.039       Coef. Var            23.574 
## Pred R-Squared          0.020       AIC                -881.055 
## MAE                     0.049       SBC                -854.356 
## ----------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
##  AIC: Akaike Information Criteria 
##  SBC: Schwarz Bayesian Criteria 
## 
##                                ANOVA                                
## -------------------------------------------------------------------
##                Sum of                                              
##               Squares         DF    Mean Square      F        Sig. 
## -------------------------------------------------------------------
## Regression      0.077          5          0.015     3.73    0.0027 
## Residual        1.356        329          0.004                    
## Total           1.433        334                                   
## -------------------------------------------------------------------
## 
##                                     Parameter Estimates                                      
## --------------------------------------------------------------------------------------------
##           model      Beta    Std. Error    Std. Beta      t        Sig      lower     upper 
## --------------------------------------------------------------------------------------------
##     (Intercept)     0.888         0.181                  4.899    0.000     0.531     1.244 
##     pitch_speed    -0.006         0.002       -0.259    -3.242    0.001    -0.009    -0.002 
## pitcher_break_x     0.007         0.004        0.399     1.828    0.068     0.000     0.014 
## pitcher_break_z     0.032         0.017        2.198     1.814    0.071    -0.003     0.066 
##     pitch_usage     0.000         0.000       -0.105    -1.922    0.055    -0.001     0.000 
##       ovr_break    -0.035         0.018       -2.597    -1.962    0.051    -0.070     0.000 
## --------------------------------------------------------------------------------------------
# Stepwise Selection
lm_simple %>% ols_step_both_p(prem = 0.15, pent = 0.15)
## 
## 
##                                   Stepwise Summary                                  
## ----------------------------------------------------------------------------------
## Step    Variable             AIC         SBC         SBIC         R2       Adj. R2 
## ----------------------------------------------------------------------------------
##  0      Base Model         -872.581    -864.953    -1823.341    0.00000    0.00000 
##  1      pitch_usage (+)    -875.048    -863.605    -1825.823    0.01324    0.01028 
##  2      pitch_speed (+)    -877.136    -861.879    -1827.881    0.02521    0.01934 
##  3      ovr_break (+)      -881.609    -862.538    -1832.217    0.04387    0.03520 
## ----------------------------------------------------------------------------------
## 
## Final Model Output 
## ------------------
## 
##                          Model Summary                           
## ----------------------------------------------------------------
## R                       0.209       RMSE                  0.064 
## R-Squared               0.044       MSE                   0.004 
## Adj. R-Squared          0.035       Coef. Var            23.624 
## Pred R-Squared          0.021       AIC                -881.609 
## MAE                     0.050       SBC                -862.538 
## ----------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
##  AIC: Akaike Information Criteria 
##  SBC: Schwarz Bayesian Criteria 
## 
##                                ANOVA                                
## -------------------------------------------------------------------
##                Sum of                                              
##               Squares         DF    Mean Square      F        Sig. 
## -------------------------------------------------------------------
## Regression      0.063          3          0.021    5.062    0.0019 
## Residual        1.370        331          0.004                    
## Total           1.433        334                                   
## -------------------------------------------------------------------
## 
##                                   Parameter Estimates                                    
## ----------------------------------------------------------------------------------------
##       model      Beta    Std. Error    Std. Beta      t        Sig      lower     upper 
## ----------------------------------------------------------------------------------------
## (Intercept)     0.861         0.176                  4.887    0.000     0.514     1.207 
## pitch_usage    -0.001         0.000       -0.121    -2.249    0.025    -0.001     0.000 
## pitch_speed    -0.006         0.002       -0.246    -3.236    0.001    -0.009    -0.002 
##   ovr_break    -0.003         0.001       -0.193    -2.541    0.012    -0.005    -0.001 
## ----------------------------------------------------------------------------------------
# New Model
lm1 <- lm(xwOBA ~ 
            ovr_break + pitch_usage + pitch_speed,
          data = model_data)

# Interaction Linear Regression
lm_interact <- lm(xwOBA ~ 
                    pitch_speed + spin_rate + pitcher_break_x + pitcher_break_z + pitch_usage + 
                    ovr_break + 
                    pitch_speed*spin_rate + pitch_speed*pitch_usage + pitch_speed*ovr_break +
                    pitch_speed*pitcher_break_x + pitch_speed*pitcher_break_z + 
                    spin_rate*ovr_break + spin_rate*pitcher_break_x + spin_rate*pitcher_break_z +
                    pitch_usage*ovr_break + pitch_usage*spin_rate,
                  data = model_data)


# model_interact_all <- ols_step_all_possible(lm_interact)

# Stepwise Selection
lm_interact %>% ols_step_both_p(pent = 0.15, prem = 0.05)
## 
## 
##                                           Stepwise Summary                                           
## ---------------------------------------------------------------------------------------------------
## Step    Variable                             AIC         SBC         SBIC         R2       Adj. R2  
## ---------------------------------------------------------------------------------------------------
##  0      Base Model                         -872.581    -864.953    -1826.673    0.00000     0.00000 
##  1      pitch_speed:pitch_usage (+)        -875.879    -864.437    -1831.752    0.01569     0.01273 
##  2      pitcher_break_z (+)                -874.227    -858.970    -1831.890    0.01671     0.01079 
##  3      pitch_speed:pitch_usage (-)        -870.698    -859.256    -1826.584    0.00035    -0.00265 
##  4      pitch_speed (+)                    -877.840    -862.584    -1835.493    0.02726     0.02140 
##  5      pitch_usage (+)                    -881.134    -862.063    -1840.555    0.04251     0.03383 
##  6      pitch_usage:ovr_break (+)          -882.750    -859.865    -1843.940    0.05279     0.04131 
##  7      ovr_break (+)                      -882.634    -855.935    -1845.597    0.05810     0.04379 
##  8      pitcher_break_z (-)                -884.460    -861.575    -1845.642    0.05761     0.04619 
##  9      spin_rate (+)                      -882.608    -855.909    -1845.571    0.05803     0.04371 
##  10     ovr_break (-)                      -874.597    -851.713    -1835.823    0.02946     0.01769 
##  11     pitch_speed:ovr_break (+)          -882.165    -855.466    -1845.130    0.05678     0.04245 
##  12     pitch_usage:ovr_break (-)          -879.449    -856.564    -1840.653    0.04341     0.03181 
##  13     pitcher_break_x (+)                -877.639    -850.940    -1840.627    0.04395     0.02942 
##  14     spin_rate (-)                      -879.454    -856.569    -1840.658    0.04342     0.03183 
##  15     pitch_speed:pitcher_break_z (+)    -880.623    -853.924    -1843.596    0.05243     0.03803 
## ---------------------------------------------------------------------------------------------------
## 
## Final Model Output 
## ------------------
## 
##                          Model Summary                           
## ----------------------------------------------------------------
## R                       0.229       RMSE                  0.064 
## R-Squared               0.052       MSE                   0.004 
## Adj. R-Squared          0.038       Coef. Var            23.589 
## Pred R-Squared          0.019       AIC                -880.623 
## MAE                     0.049       SBC                -853.924 
## ----------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
##  AIC: Akaike Information Criteria 
##  SBC: Schwarz Bayesian Criteria 
## 
##                                ANOVA                                
## -------------------------------------------------------------------
##                Sum of                                              
##               Squares         DF    Mean Square      F        Sig. 
## -------------------------------------------------------------------
## Regression      0.075          5          0.015    3.641    0.0032 
## Residual        1.358        329          0.004                    
## Total           1.433        334                                   
## -------------------------------------------------------------------
## 
##                                           Parameter Estimates                                            
## --------------------------------------------------------------------------------------------------------
##                       model      Beta    Std. Error    Std. Beta      t        Sig      lower     upper 
## --------------------------------------------------------------------------------------------------------
##                 (Intercept)     0.729         0.155                  4.705    0.000     0.424     1.034 
##                 pitch_speed    -0.004         0.002       -0.179    -2.577    0.010    -0.007    -0.001 
##                 pitch_usage    -0.001         0.000       -0.106    -1.946    0.053    -0.001     0.000 
##             pitcher_break_x     0.007         0.004        0.404     1.778    0.076    -0.001     0.014 
##       pitch_speed:ovr_break     0.000         0.000       -2.250    -1.904    0.058    -0.001     0.000 
## pitch_speed:pitcher_break_z     0.000         0.000        1.935     1.768    0.078     0.000     0.001 
## --------------------------------------------------------------------------------------------------------
# Output removes ALL interactions for p < 0.05
# Keeps same as simple LM pitcher_break_z + pitch_speed + pitch_usage

lm_interact %>% ols_step_both_p(pent = 0.15, prem = 0.10)
## 
## 
##                                           Stepwise Summary                                           
## ---------------------------------------------------------------------------------------------------
## Step    Variable                             AIC         SBC         SBIC         R2       Adj. R2  
## ---------------------------------------------------------------------------------------------------
##  0      Base Model                         -872.581    -864.953    -1826.673    0.00000     0.00000 
##  1      pitch_speed:pitch_usage (+)        -875.879    -864.437    -1831.752    0.01569     0.01273 
##  2      pitcher_break_z (+)                -874.227    -858.970    -1831.890    0.01671     0.01079 
##  3      pitch_speed:pitch_usage (-)        -870.698    -859.256    -1826.584    0.00035    -0.00265 
##  4      pitch_speed (+)                    -877.840    -862.584    -1835.493    0.02726     0.02140 
##  5      pitch_usage (+)                    -881.134    -862.063    -1840.555    0.04251     0.03383 
##  6      pitch_usage:ovr_break (+)          -882.750    -859.865    -1843.940    0.05279     0.04131 
##  7      ovr_break (+)                      -882.634    -855.935    -1845.597    0.05810     0.04379 
##  8      pitcher_break_z (-)                -884.460    -861.575    -1845.642    0.05761     0.04619 
##  9      spin_rate (+)                      -882.608    -855.909    -1845.571    0.05803     0.04371 
##  10     ovr_break (-)                      -874.597    -851.713    -1835.823    0.02946     0.01769 
##  11     pitch_speed:ovr_break (+)          -882.165    -855.466    -1845.130    0.05678     0.04245 
##  12     pitch_usage:ovr_break (-)          -879.449    -856.564    -1840.653    0.04341     0.03181 
##  13     pitcher_break_x (+)                -877.639    -850.940    -1840.627    0.04395     0.02942 
##  14     spin_rate (-)                      -879.454    -856.569    -1840.658    0.04342     0.03183 
##  15     pitch_speed:pitcher_break_z (+)    -880.623    -853.924    -1843.596    0.05243     0.03803 
## ---------------------------------------------------------------------------------------------------
## 
## Final Model Output 
## ------------------
## 
##                          Model Summary                           
## ----------------------------------------------------------------
## R                       0.229       RMSE                  0.064 
## R-Squared               0.052       MSE                   0.004 
## Adj. R-Squared          0.038       Coef. Var            23.589 
## Pred R-Squared          0.019       AIC                -880.623 
## MAE                     0.049       SBC                -853.924 
## ----------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
##  AIC: Akaike Information Criteria 
##  SBC: Schwarz Bayesian Criteria 
## 
##                                ANOVA                                
## -------------------------------------------------------------------
##                Sum of                                              
##               Squares         DF    Mean Square      F        Sig. 
## -------------------------------------------------------------------
## Regression      0.075          5          0.015    3.641    0.0032 
## Residual        1.358        329          0.004                    
## Total           1.433        334                                   
## -------------------------------------------------------------------
## 
##                                           Parameter Estimates                                            
## --------------------------------------------------------------------------------------------------------
##                       model      Beta    Std. Error    Std. Beta      t        Sig      lower     upper 
## --------------------------------------------------------------------------------------------------------
##                 (Intercept)     0.729         0.155                  4.705    0.000     0.424     1.034 
##                 pitch_speed    -0.004         0.002       -0.179    -2.577    0.010    -0.007    -0.001 
##                 pitch_usage    -0.001         0.000       -0.106    -1.946    0.053    -0.001     0.000 
##             pitcher_break_x     0.007         0.004        0.404     1.778    0.076    -0.001     0.014 
##       pitch_speed:ovr_break     0.000         0.000       -2.250    -1.904    0.058    -0.001     0.000 
## pitch_speed:pitcher_break_z     0.000         0.000        1.935     1.768    0.078     0.000     0.001 
## --------------------------------------------------------------------------------------------------------
# New Model with Interactions (p -value < 0.10 threshhold)
lm2 <- lm(xwOBA ~ 
            pitch_speed + pitch_usage + 
            pitch_speed*ovr_break + pitch_speed*pitcher_break_z,
          data = model_data)


# Trimmed Data
model_results <- model_data %>% 
  select(first_name, last_name, 
         pitch_speed, pitch_usage, pitcher_break_z, ovr_break, xwOBA)

# Comparing Model Predictions
# lm1 = simple
# lm2 = interactions

model_results <- model_results %>% 
  mutate(lm1 = predict(lm1, model_results)) %>% 
  mutate(lm2 = predict(lm2, model_results))

# R and RMSE of Simple Linear Model
with(model_results, cor(xwOBA, lm1))
## [1] 0.2094431
with(model_results, rmse(xwOBA, lm1))
## [1] 0.06394718
# R and RMSE of Interactions Linear Model
with(model_results, cor(xwOBA, lm2))
## [1] 0.2151546
with(model_results, rmse(xwOBA, lm2))
## [1] 0.06386603
model_results %>% 
  select(xwOBA, lm1, lm2) %>%
  pivot_longer(cols = lm1:lm2, 
               names_to = "model",
               values_to = "pred") %>% 
  mutate(model = str_replace(model, "lm1", "Simple LM"),
         model = str_replace(model, "lm2", "Interaction LM")) %>%
  ggplot(aes(x = xwOBA, y = pred, color = model)) +
  geom_point(shape = 18, size = 1.5, alpha = 0.75) + 
  geom_smooth(se = FALSE) +
  scale_color_manual(values = c("navyblue", "skyblue")) +
  theme_classic() +
  labs(title = "Linear Models for RHP",
       x = "Observed",
       y = "Predicted",
       color = "Model")

# Model Data (Pitch = Slider, Pitching Hand = Left)
model_data_l <- arsenal %>% 
  filter(pitch_type == "SL",
         pitch_hand == "L") %>% 
  mutate(ovr_break = sqrt(pitcher_break_x^2 + pitcher_break_z^2))

# Simple Linear Regression
lm_simple_l <- lm(xwOBA ~ 
                    pitch_speed + spin_rate + pitcher_break_x + pitcher_break_z +
                    pitch_usage + ovr_break,
                  data = model_data_l)



# Overview of all model combinations
model_all_l <- ols_step_all_possible(lm_simple_l)

# Backwards Elimination
lm_simple_l %>% ols_step_backward_p(penter = 0.15)
## 
## 
##                                 Stepwise Summary                                
## ------------------------------------------------------------------------------
## Step    Variable         AIC         SBC         SBIC        R2       Adj. R2  
## ------------------------------------------------------------------------------
##  0      Full Model     -205.439    -184.518    -491.033    0.04617    -0.01471 
##  1      pitch_speed    -207.131    -188.825    -492.913    0.04325    -0.00710 
##  2      spin_rate      -208.977    -193.287    -494.917    0.04180     0.00187 
##  3      pitch_usage    -209.989    -196.914    -496.149    0.03238     0.00245 
## ------------------------------------------------------------------------------
## 
## Final Model Output 
## ------------------
## 
##                           Model Summary                           
## -----------------------------------------------------------------
## R                        0.180       RMSE                  0.081 
## R-Squared                0.032       MSE                   0.007 
## Adj. R-Squared           0.002       Coef. Var            29.106 
## Pred R-Squared          -0.036       AIC                -209.989 
## MAE                      0.057       SBC                -196.914 
## -----------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
##  AIC: Akaike Information Criteria 
##  SBC: Schwarz Bayesian Criteria 
## 
##                                ANOVA                                
## -------------------------------------------------------------------
##                Sum of                                              
##               Squares         DF    Mean Square      F        Sig. 
## -------------------------------------------------------------------
## Regression      0.022          3          0.007    1.082    0.3605 
## Residual        0.670         97          0.007                    
## Total           0.692        100                                   
## -------------------------------------------------------------------
## 
##                                     Parameter Estimates                                     
## -------------------------------------------------------------------------------------------
##           model      Beta    Std. Error    Std. Beta      t        Sig      lower    upper 
## -------------------------------------------------------------------------------------------
##     (Intercept)     0.336         0.068                  4.936    0.000     0.201    0.471 
## pitcher_break_x    -0.013         0.009       -0.653    -1.425    0.157    -0.031    0.005 
## pitcher_break_z    -0.073         0.046       -4.509    -1.604    0.112    -0.164    0.017 
##       ovr_break     0.073         0.046        4.835     1.568    0.120    -0.019    0.165 
## -------------------------------------------------------------------------------------------
# Stepwise Selection
# lm_simple_l %>% ols_step_both_p(prem = 0.15, pent = 0.15)

# New Model
lm1_l <- lm(xwOBA ~ 
              pitcher_break_x + pitcher_break_z +
              ovr_break,
            data = model_data_l)


# Model Data (Pitch = Slider, Pitching Hand = Left)

# Interaction Linear Regression
lm_interact_l <- lm(xwOBA ~ 
                      pitch_speed + spin_rate + pitcher_break_x + pitcher_break_z + pitch_usage + 
                      ovr_break + 
                      pitch_speed*spin_rate + pitch_speed*pitch_usage + pitch_speed*ovr_break +
                      pitch_speed*pitcher_break_x + pitch_speed*pitcher_break_z + 
                      spin_rate*ovr_break + spin_rate*pitcher_break_x + spin_rate*pitcher_break_z +
                      pitch_usage*ovr_break + pitch_usage*spin_rate,
                    data = model_data_l)


# model_interact_all <- ols_step_all_possible(lm_interact)

# Stepwise Selection
# lm_interact_l %>% ols_step_both_p(pent = 0.15, prem = 0.05)

# Output removes ALL interactions for p < 0.05
# Keeps same as simple LM pitcher_break_z + pitch_speed + pitch_usage

# lm_interact_l %>% ols_step_both_p(pent = 0.20, prem = 0.05)

# New Model with Interactions (p -value < 0.10 threshhold)
lm2_l <- lm(xwOBA ~ 
              spin_rate*pitcher_break_z,
            data = model_data_l)


# Trimmed Data
model_results_l <- model_data_l %>% 
  select(first_name, last_name, 
         pitch_speed, pitch_usage, pitcher_break_z, ovr_break, xwOBA)

# Comparing Model Predictions
# lm1 = simple
# lm2 = interactions

model_results_l <- model_results_l %>% 
  mutate(lm1 = predict(lm1, model_results_l)) %>% 
  mutate(lm2 = predict(lm2, model_results_l))

# R and RMSE of Simple Linear Model
with(model_results_l, cor(xwOBA, lm1))
## [1] 0.04569634
with(model_results_l, rmse(xwOBA, lm1))
## [1] 0.08383695
# R and RMSE of Interactions Linear Model
with(model_results_l, cor(xwOBA, lm2))
## [1] 0.02805347
with(model_results_l, rmse(xwOBA, lm2))
## [1] 0.08418886
model_results_l %>% 
  select(xwOBA, lm1, lm2) %>%
  pivot_longer(cols = lm1:lm2, 
               names_to = "model",
               values_to = "pred") %>% 
  mutate(model = str_replace(model, "lm1", "Simple LM"),
         model = str_replace(model, "lm2", "Interaction LM")) %>%
  ggplot(aes(x = xwOBA, y = pred, color = model)) +
  geom_point(shape = 18, size = 1.5, alpha = 0.75) + 
  geom_smooth(se = FALSE) +
  scale_color_manual(values = c("navyblue", "skyblue")) +
  theme_classic() +
  labs(title = "Linear Models for LHP",
       x = "Observed",
       y = "Predicted",
       color = "Model")

# Pitch by Pitch Data (Sliders)
pitches <- pitchers %>% 
  filter(pitch_type == "SL") %>% 
  mutate(pfx_x = pfx_x*12,
         pfx_z = pfx_z*12,
         ovr_break = round(sqrt(pfx_x^2 + pfx_z^2), 3))

# Simple Linear Regression
lm_pitches <- lm(delta_run_exp ~ 
                   release_speed + release_spin_rate + pfx_x + pfx_z +
                   ovr_break + release_extension,
                 data = pitches)



# Stepwise Selection
# lm_pitches %>% ols_step_both_p(prem = 0.25, pent = 0.15)

# New Model
lm1_pitches <- lm(delta_run_exp ~ 
                    release_spin_rate + pfx_x + release_extension,
                  data = pitches)


# Interaction Linear Regression
lm_interact_pitches <- lm(delta_run_exp ~ 
                            release_speed + release_spin_rate + pfx_x + pfx_z +
                            ovr_break + release_extension +
                            release_speed*release_spin_rate + release_speed*pfx_x +
                            release_speed*pfx_z + release_speed*ovr_break +
                            release_speed*release_extension + 
                            release_spin_rate*pfx_x + release_spin_rate*pfx_z + 
                            release_spin_rate*ovr_break + release_spin_rate*release_extension +
                            release_extension*pfx_x + release_extension*pfx_z + release_extension*ovr_break,
                          data = pitches)


# model_interact_all <- ols_step_all_possible(lm_interact)

# Stepwise Selection
lm_interact_pitches %>% ols_step_both_p(pent = 0.15, prem = 0.15)
## 
## 
##                                        Stepwise Summary                                        
## ---------------------------------------------------------------------------------------------
## Step    Variable                         AIC        SBC         SBIC         R2       Adj. R2 
## ---------------------------------------------------------------------------------------------
##  0      Base Model                     -72.271    -59.891    -10299.956    0.00000    0.00000 
##  1      release_spin_rate:pfx_x (+)    -73.773    -55.203    -10301.429    0.00097    0.00069 
##  2      release_extension (+)          -73.562    -48.803    -10301.189    0.00147    0.00091 
##  3      ovr_break (+)                  -73.409    -42.460    -10301.005    0.00198    0.00115 
##  4      release_speed (+)              -71.418    -34.280    -10298.986    0.00198    0.00087 
##  5      ovr_break (-)                  -71.620    -40.671    -10299.220    0.00148    0.00065 
##  6      pfx_x (+)                      -71.805    -34.667    -10299.372    0.00209    0.00098 
##  7      release_extension (-)          -72.690    -41.741    -10300.287    0.00178    0.00095 
##  8      pfx_z (+)                      -70.939    -33.800    -10298.508    0.00185    0.00074 
##  9      pfx_x (-)                      -70.480    -39.531    -10298.082    0.00117    0.00033 
##  10     release_speed:pfx_z (+)        -74.421    -37.282    -10301.980    0.00281    0.00170 
##  11     release_spin_rate:pfx_x (-)    -75.339    -44.391    -10302.931    0.00251    0.00168 
##  12     release_spin_rate (+)          -75.115    -37.976    -10302.672    0.00300    0.00190 
##  13     pfx_z (-)                      -71.481    -40.532    -10299.081    0.00144    0.00061 
## ---------------------------------------------------------------------------------------------
## 
## Final Model Output 
## ------------------
## 
##                           Model Summary                            
## ------------------------------------------------------------------
## R                        0.038       RMSE                   0.239 
## R-Squared                0.001       MSE                    0.057 
## Adj. R-Squared           0.001       Coef. Var          -3503.793 
## Pred R-Squared          -0.001       AIC                  -71.481 
## MAE                      0.118       SBC                  -40.532 
## ------------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
##  AIC: Akaike Information Criteria 
##  SBC: Schwarz Bayesian Criteria 
## 
##                                ANOVA                                 
## --------------------------------------------------------------------
##                Sum of                                               
##               Squares          DF    Mean Square      F        Sig. 
## --------------------------------------------------------------------
## Regression      0.298           3          0.099    1.736    0.1574 
## Residual      206.296        3600          0.057                    
## Total         206.595        3603                                   
## --------------------------------------------------------------------
## 
##                                       Parameter Estimates                                       
## -----------------------------------------------------------------------------------------------
##               model      Beta    Std. Error    Std. Beta      t        Sig      lower    upper 
## -----------------------------------------------------------------------------------------------
##         (Intercept)     0.095         0.129                  0.738    0.461    -0.158    0.348 
##       release_speed    -0.002         0.002       -0.022    -1.209    0.227    -0.005    0.001 
##   release_spin_rate     0.000         0.000        0.026     1.325    0.185     0.000    0.000 
## release_speed:pfx_z     0.000         0.000       -0.012    -0.583    0.560     0.000    0.000 
## -----------------------------------------------------------------------------------------------
lm2_pitches <- lm(delta_run_exp ~ 
                    release_spin_rate*pfx_x + release_extension,
                  data = pitches)


model_results_pitches <- pitches %>% 
  select(delta_run_exp, release_speed, release_spin_rate, pfx_x, pfx_z,
         release_extension) %>% 
  mutate(lm1 = predict(lm1_pitches, pitches),
         lm2 = predict(lm2_pitches, pitches))

# R and RMSE of Simple Linear Model

model_results_pitches %>% 
  filter(!is.na(release_spin_rate),
         !is.na(pfx_x),
         !is.na(release_extension),
         !is.na(delta_run_exp)) %>% 
  with(cor(delta_run_exp, lm1))
## [1] 0.04470764
model_results_pitches %>% 
  filter(!is.na(release_spin_rate),
         !is.na(pfx_x),
         !is.na(release_extension),
         !is.na(delta_run_exp)) %>% 
  with(rmse(delta_run_exp, lm1))
## [1] 0.2391844
# R and RMSE of Interaction Linear Model
model_results_pitches %>% 
  filter(!is.na(release_spin_rate),
         !is.na(pfx_x),
         !is.na(release_extension),
         !is.na(delta_run_exp)) %>% 
  with(cor(delta_run_exp, lm2))
## [1] 0.04598308
model_results_pitches %>% 
  filter(!is.na(release_spin_rate),
         !is.na(pfx_x),
         !is.na(release_extension),
         !is.na(delta_run_exp)) %>% 
  with(rmse(delta_run_exp, lm2))
## [1] 0.2391706
# Graph
model_results_pitches %>% 
  select(delta_run_exp, lm1, lm2) %>%
  pivot_longer(cols = lm1:lm2, 
               names_to = "model",
               values_to = "pred") %>% 
  mutate(model = str_replace(model, "lm1", "Simple LM"),
         model = str_replace(model, "lm2", "Interaction LM")) %>%
  ggplot(aes(x = delta_run_exp, y = pred, color = model)) +
  geom_point(shape = 18, size = 1.5, alpha = 0.75) + 
  geom_smooth(se = FALSE) +
  scale_color_manual(values = c("navyblue", "skyblue")) +
  theme_classic() +
  labs(title = "Linear Models for Pitch-by-Pitch Data",
       subtitle = "Predicting Run Expectancy Added",
       caption = "Pitchers: Scherzer, Taillon, Keller, Manoah, Gallen, Garcia, Gray",
       x = "Observed",
       y = "Predicted",
       color = "Model")

model_results_pitches_2 <- pitches %>% 
  select(delta_run_exp, release_speed, release_spin_rate, pfx_x, pfx_z,
         release_extension, ID) %>% 
  mutate(lm1 = predict(lm1_pitches, pitches),
         lm2 = predict(lm2_pitches, pitches))

# Graph
model_results_pitches_2 %>% 
  select(ID,delta_run_exp, lm1, lm2) %>%
  pivot_longer(cols = lm1:lm2, 
               names_to = "model",
               values_to = "pred") %>% 
  mutate(model = str_replace(model, "lm1", "Simple LM"),
         model = str_replace(model, "lm2", "Interaction LM")) %>%
  ggplot(aes(x = delta_run_exp, y = pred, color = model)) +
  geom_point(shape = 18, size = 1.5, alpha = 0.75) + 
  geom_smooth(se = FALSE) +
  scale_color_manual(values = c("navyblue", "skyblue")) +
  facet_wrap(~ ID, ncol = 1) +
  theme_classic() +
  labs(title = "Linear Models for Pitch-by-Pitch Data",
       subtitle = "Predicting Run Expectancy Added",
       caption = "Pitchers: Scherzer, Taillon, Keller, Manoah, Gallen, Garcia, Gray",
       x = "Observed",
       y = "Predicted",
       color = "Model")

# Correlations

model_results_pitches_2 %>% 
  filter(ID == "Great") %>% 
  filter(!is.na(release_spin_rate),
         !is.na(pfx_x),
         !is.na(release_extension),
         !is.na(delta_run_exp)) %>% 
  with(cor(delta_run_exp, lm1))
## [1] 0.06192874
model_results_pitches_2 %>% 
  filter(ID == "Decent") %>% 
  filter(!is.na(release_spin_rate),
         !is.na(pfx_x),
         !is.na(release_extension),
         !is.na(delta_run_exp)) %>% 
  with(cor(delta_run_exp, lm1))
## [1] 0.01215481
model_results_pitches_2 %>% 
  filter(ID == "Bad") %>% 
  filter(!is.na(release_spin_rate),
         !is.na(pfx_x),
         !is.na(release_extension),
         !is.na(delta_run_exp)) %>% 
  with(cor(delta_run_exp, lm1))
## [1] 0.03652784
# RMSE

model_results_pitches_2 %>% 
  filter(ID == "Great") %>% 
  filter(!is.na(release_spin_rate),
         !is.na(pfx_x),
         !is.na(release_extension),
         !is.na(delta_run_exp)) %>% 
  with(rmse(delta_run_exp, lm1))
## [1] 0.1927567
model_results_pitches_2 %>% 
  filter(ID == "Decent") %>% 
  filter(!is.na(release_spin_rate),
         !is.na(pfx_x),
         !is.na(release_extension),
         !is.na(delta_run_exp)) %>% 
  with(rmse(delta_run_exp, lm1))
## [1] 0.2850569
model_results_pitches_2 %>% 
  filter(ID == "Bad") %>% 
  filter(!is.na(release_spin_rate),
         !is.na(pfx_x),
         !is.na(release_extension),
         !is.na(delta_run_exp)) %>% 
  with(rmse(delta_run_exp, lm1))
## [1] 0.2510364
pitchers %>% 
  # filter(pitch_type == "SL") %>%
  ggplot(aes(x = plate_z)) +
  geom_histogram(binwidth = 0.15, color = "white")

pitchers %>% 
  ggplot(aes(x = delta_run_exp)) +
  geom_histogram(binwidth = 0.15, color = "white")

pitchers %>% 
  filter(pitch_type %in% c("FF", "SL", "CH")) %>%
  mutate(pitch_dist = sqrt(plate_x^2 + (2.5 - plate_z)^2)) %>% 
  ggplot(aes(x = pitch_dist, color = ID)) +
  geom_density() +
  facet_wrap(~ pitch_type,
             ncol = 1)

pitchers %>% 
  filter(pitch_type %in% c("FF", "SL", "CH")) %>%
  mutate(pitch_dist = sqrt(plate_x^2 + (2.5 - plate_z)^2)) %>%
  ggplot(aes(x = pitch_dist, y = delta_run_exp)) +
  geom_point(alpha = 0.15)



Modeling with Difference Variables


# Model?
model <- lm(delta_run_exp ~ dist + speed_change + break_change, 
            data = pitchers)

preds <- pitchers %>% 
  mutate(predicted = predict(model, pitchers)) %>% 
  rename(observed = delta_run_exp) %>% 
  select(ID, zone, pitch_type, observed, predicted) %>% 
  filter(pitch_type != "PO")

acc <- preds %>% 
  ggplot(aes(x = observed, y = predicted)) +
  geom_point(alpha = 0.5) +
  geom_smooth()

acc

acc +
  facet_wrap(~ pitch_type)

acc +
  facet_wrap(~ ID)

data_ff <- pitchers %>% 
  filter(pitch_type == "FF",
         !is.na(break_change)) %>%  
  filter(pitch_type != "PO")

model_ff <- train(
  delta_run_exp ~ dist + speed_change + break_change + release_speed + pfx_x + pfx_z,
  data = data_ff,
  method = "ranger",
  trControl = trainControl(method = "cv", number = 5))

preds_ff <- cbind(data_ff, predict(model_ff)) %>% 
  as.data.frame() %>% 
  rename(observed = delta_run_exp,
         predicted = "predict(model_ff)") %>% 
  select(ID, zone, pitch_type, observed, predicted)

preds_ff %>% 
  ggplot(aes(x = observed, y = predicted)) +
  geom_point() +
  geom_smooth() +
  geom_abline(slope = 1, intercept = 0) +
  coord_fixed() +
  labs(title = "Fastball RF Model",
       caption = paste0("RMSE: ", round(rmse(preds_ff$observed, preds_ff$predicted), 4)))

preds_ff %>% 
  filter(!is.na(observed),
         !is.na(predicted)) %>% 
  with(cor(observed, predicted))
## [1] 0.9625188
data_ff <- pitchers %>% 
  filter(pitch_type == "FF",
         !is.na(break_change))

model_ff <- ranger(delta_run_exp ~ dist + speed_change + break_change, 
            data = data_ff, mtry = 2)

preds_ff <- data_ff %>% 
  mutate(predicted = predict(model_ff, data_ff)$predictions) %>% 
  rename(observed = delta_run_exp) %>% 
  select(ID, zone, pitch_type, observed, predicted) %>% 
  filter(pitch_type != "PO")

preds_ff %>% 
  ggplot(aes(x = observed, y = predicted)) +
  geom_point() +
  geom_smooth() +
  labs(title = "Fastball RF Model",
       caption = paste0("RMSE: ", round(rmse(preds_ff$observed, preds_ff$predicted), 4)))

preds_ff %>% 
  filter(!is.na(observed),
         !is.na(predicted)) %>% 
  with(cor(observed, predicted))
## [1] 0.9710876
data_si <- pitchers %>% 
  filter(pitch_type == "SI",
         !is.na(break_change))

model_si <- ranger(delta_run_exp ~ dist + speed_change + break_change, 
            data = data_si, mtry = 2)

preds_si <- data_si %>% 
  mutate(predicted = predict(model_si, data_si)$predictions) %>% 
  rename(observed = delta_run_exp) %>% 
  select(ID, zone, pitch_type, observed, predicted) %>% 
  filter(pitch_type != "PO")

preds_si %>% 
  ggplot(aes(x = observed, y = predicted)) +
  geom_point() +
  geom_smooth() +
  labs(title = "Sinker RF Model",
       caption = paste0("RMSE: ", round(rmse(preds_si$observed, preds_si$predicted), 4)))

preds_si %>% 
  filter(!is.na(observed),
         !is.na(predicted)) %>% 
  with(cor(observed, predicted))
## [1] 0.9591401
data_ch <- pitchers %>% 
  filter(pitch_type == "CH",
         !is.na(break_change))

model_ch <- ranger(delta_run_exp ~ dist + speed_change + break_change, 
            data = data_ch, mtry = 2)

preds_ch <- data_ch %>% 
  mutate(predicted = predict(model_ch, data_ch)$predictions) %>% 
  rename(observed = delta_run_exp) %>% 
  select(ID, zone, pitch_type, observed, predicted) %>% 
  filter(pitch_type != "PO")

preds_ch %>% 
  ggplot(aes(x = observed, y = predicted)) +
  geom_point() +
  geom_smooth() +
  labs(title = "Change-Up RF Model",
       caption = paste0("RMSE: ", round(rmse(preds_ch$observed, preds_ch$predicted), 4)))

preds_ch %>% 
  filter(!is.na(observed),
         !is.na(predicted)) %>% 
  with(cor(observed, predicted))
## [1] 0.9661838
data_sl <- pitchers %>% 
  filter(pitch_type == "SL",
         !is.na(break_change))

model_sl <- ranger(delta_run_exp ~ dist + speed_change + break_change, 
            data = data_sl, mtry = 2)

preds_sl <- data_sl %>% 
  mutate(predicted = predict(model_sl, data_sl)$predictions) %>% 
  rename(observed = delta_run_exp) %>% 
  select(ID, zone, pitch_type, observed, predicted) %>% 
  filter(pitch_type != "PO")

preds_sl %>% 
  ggplot(aes(x = observed, y = predicted)) +
  geom_point() +
  geom_smooth() +
  labs(title = "Slider RF Model",
       caption = paste0("RMSE: ", round(rmse(preds_sl$observed, preds_sl$predicted), 4)))

preds_sl %>% 
  filter(!is.na(observed),
         !is.na(predicted)) %>% 
  with(cor(observed, predicted))
## [1] 0.9677164
# 
# Actmodel <- train(delta_run_exp ~ dist + speed_change + break_change,
#                   data = data_sl, method = "ranger", 
#                   trControl = trainControl(method = "cv", number = 10, verboseIter = TRUE), preProcess = c("knnImpute"))
# plot(Actmodel$finalModel$forest)


Logistic Regression Whiff RHP

# Slider Logistic Model
whiff_sl <- whiff %>% 
  filter(pitch_type == "SL") %>% 
  mutate(whiff = str_replace(whiff, "TRUE", "1"),
         whiff = str_replace(whiff, "FALSE", "0"),
         whiff = as.numeric(whiff))

# Original Model
model1 <- glm(whiff ~ release_speed + spin_axis + pfx_x + pfx_z + plate_x + plate_z +
              release_spin_rate + speed_change + break_change + pfx_total + dist,
              data = whiff_sl, family = binomial)

# Reduced Model
model1 <- glm(whiff ~ release_speed + plate_x + plate_z +
              release_spin_rate + speed_change + break_change + pfx_total + dist,
              data = whiff_sl, family = binomial)

summary(model1)
## 
## Call:
## glm(formula = whiff ~ release_speed + plate_x + plate_z + release_spin_rate + 
##     speed_change + break_change + pfx_total + dist, family = binomial, 
##     data = whiff_sl)
## 
## Coefficients:
##                     Estimate Std. Error z value       Pr(>|z|)    
## (Intercept)        9.0550541  3.0643314   2.955        0.00313 ** 
## release_speed     -0.0924999  0.0353807  -2.614        0.00894 ** 
## plate_x            0.5454560  0.0870095   6.269 0.000000000364 ***
## plate_z           -0.2438012  0.0988763  -2.466        0.01367 *  
## release_spin_rate -0.0006138  0.0002186  -2.808        0.00498 ** 
## speed_change       0.1267177  0.0611358   2.073        0.03820 *  
## break_change       1.0446186  0.3722565   2.806        0.00501 ** 
## pfx_total         -0.9888782  0.2453062  -4.031 0.000055493095 ***
## dist              -0.5004963  0.1216086  -4.116 0.000038612157 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3093.3  on 3604  degrees of freedom
## Residual deviance: 3003.3  on 3596  degrees of freedom
##   (6 observations deleted due to missingness)
## AIC: 3021.3
## 
## Number of Fisher Scoring iterations: 5
preds <- whiff_sl %>% 
  mutate(prediction_log = predict(model1, whiff_sl),
         prediction = 1 / (1 + exp(-prediction_log)),
         rounded_pred = case_when(
           prediction >= 0.5 ~ 1,
           prediction < 0.5 ~ 0
         )) %>% 
  filter(!is.na(prediction))

preds %>% 
  ggplot(aes(x = as.character(whiff), y = prediction)) +
  geom_boxplot() +
  geom_jitter(alpha = 0.1, width = 0.1, height = 0)

preds %>% 
  mutate(prediction = round(prediction, 2)) %>% 
  group_by(prediction) %>% 
  summarize(mean(whiff)) %>% 
  as.data.frame() %>% 
  ggplot(aes(x = prediction, y = `mean(whiff)`)) +
  geom_point() +
  geom_smooth(se = FALSE) +
  labs(y = "observed whiff proportion",
       x = "projected % whiff chance",
       title = "Whiff proportion by predicted whiff value",
       subtitle = "Whiff predictions have a 1% bin width")

preds %>% 
  arrange(desc(prediction)) %>% 
  head(10)
## # A tibble: 10 × 85
##    pitch_type game_date  release_speed release_pos_x release_pos_z player_name 
##    <chr>      <date>             <dbl>         <dbl>         <dbl> <chr>       
##  1 SL         2022-06-03          85.5         -2.02          5.32 Gray, Josiah
##  2 SL         2022-04-13          83.4         -1.79          5.18 Gray, Josiah
##  3 SL         2022-04-13          84.7         -1.75          5.25 Gray, Josiah
##  4 SL         2022-04-13          83.3         -1.71          5.09 Gray, Josiah
##  5 SL         2022-04-08          84.5         -1.54          5.17 Gray, Josiah
##  6 SL         2022-04-13          84.1         -1.74          5.19 Gray, Josiah
##  7 SL         2022-04-08          83.7         -1.61          5.2  Gray, Josiah
##  8 SL         2022-04-08          86.2         -1.84          5.4  Gray, Josiah
##  9 SL         2022-04-26          85.7         -1.79          5.28 Gray, Josiah
## 10 SL         2022-04-08          85.1         -1.69          5.19 Gray, Josiah
## # ℹ 79 more variables: batter <dbl>, pitcher...8 <dbl>, events <chr>,
## #   description <chr>, zone <dbl>, des <chr>, game_type <chr>, stand <chr>,
## #   p_throws <chr>, home_team <chr>, away_team <chr>, type <chr>,
## #   hit_location <dbl>, bb_type <chr>, balls <dbl>, strikes <dbl>,
## #   game_year <dbl>, pfx_x <dbl>, pfx_z <dbl>, plate_x <dbl>, plate_z <dbl>,
## #   on_3b <dbl>, on_2b <dbl>, on_1b <dbl>, outs_when_up <dbl>, inning <dbl>,
## #   inning_topbot <chr>, hc_x <dbl>, hc_y <dbl>, tfs_deprecated <lgl>, …
whiff %>% 
  mutate(count = paste0(balls, "-", strikes)) %>% 
  filter(pitch_type == "SL") %>% 
  ggplot(aes(y = whiff, x = pfx_z*12)) +
  geom_violin() +
  geom_boxplot(alpha = 0.5, width = 0.5) +
  facet_wrap(~count) +
  labs(title = "Whiff vs. Non-Whiff by Vertical Movement",
       x = "Induced Vertical Movement (in.)",
       y = "Outcome") +
  NULL

# Sliders
whiff %>% 
  filter(pitch_type =="SL") %>% 
  ggplot(aes(x = -plate_x, y = plate_z)) + 
  geom_density_2d_filled(contour_var = "density", alpha = 0.5) +
  geom_zone() +
  # geom_point(alpha = 0.2) +
  coord_fixed() +
  facet_grid(cols = vars(stand), rows = vars(whiff)) +
  theme_bw()

# Fastballs
whiff %>% 
  filter(pitch_type =="FF") %>% 
  ggplot(aes(x = -plate_x, y = plate_z)) + 
  geom_density_2d_filled(contour_var = "density", alpha = 0.5) +
  geom_zone() +
  # geom_point(alpha = 0.2) +
  coord_fixed() +
  facet_grid(cols = vars(stand), rows = vars(whiff)) +
  theme_bw()

# Change-Ups
whiff %>% 
  filter(pitch_type =="CH") %>% 
  ggplot(aes(x = -plate_x, y = plate_z)) + 
  geom_density_2d_filled(contour_var = "density", alpha = 0.5) +
  geom_zone() +
  # geom_point(alpha = 0.2) +
  coord_fixed() +
  facet_grid(cols = vars(stand), rows = vars(whiff)) +
  theme_bw()

whiff %>% 
  arrange(game_date, player_name, at_bat_number, pitch_number) %>% 
  mutate(prev_pitch = lag(pitch_type, n = 1, default = NA)) %>% 
  filter(stand == "R",
         pitch_type == "SL",
         prev_pitch  %in% c("FF", "CH", "SL", "CU"),
         player_name == "Scherzer, Max") %>% 
  ggplot(aes(x = -plate_x, y = plate_z)) + 
  geom_density_2d_filled(contour_var = "density", alpha = 0.5) +
  geom_zone() +
  coord_fixed() +
  facet_grid(cols = vars(prev_pitch), rows = vars(whiff)) +
  theme_bw()

whiff %>% 
  arrange(game_date, player_name, at_bat_number, pitch_number) %>% 
  mutate(prev_pitch = lag(pitch_type, n = 1, default = NA)) %>% 
  filter(stand == "R",
         pitch_type == "FF") %>% 
  ggplot(aes(x = -plate_x, y = plate_z)) + 
  geom_density_2d_filled(contour_var = "density", alpha = 0.5) +
  geom_zone() +
  coord_fixed() +
  facet_grid(cols = vars(player_name), rows = vars(whiff)) +
  theme_bw()

zoned <- whiff %>% 
  mutate(loc_x = round(plate_x*3, 0),
         loc_y = round(plate_z*3, 0))

zoned %>% 
  filter(pitch_type == "FF",
         plate_z > 0 & plate_z < 6,
         plate_x > -1.5 & plate_x < 1.5) %>% 
  summarize(whiff_perc = mean(whiff == "TRUE"),
            pitches = n(),
            .by = c(loc_x, loc_y, player_name)) %>% 
  filter(pitches >= 10) %>% 
  ggplot(aes(x = -loc_x, y = loc_y, fill = whiff_perc)) + 
  geom_tile() +
  scale_fill_gradient(low = "gray", high = "red") +
  facet_wrap(~ player_name) +
  coord_fixed() +
  theme_bw()


Logistic Regression Whiff LHP

Logistic Regression

# Slider Logistic Model
whiff_sl2 <- whiff_l %>% 
  filter(pitch_type == "SL") %>% 
  mutate(whiff = str_replace(whiff, "TRUE", "1"),
         whiff = str_replace(whiff, "FALSE", "0"),
         whiff = as.numeric(whiff))

# Original Model
model2 <- glm(whiff ~ pitch_speed + spin_axis + pfx_x + pfx_z + plate_x + plate_z +
              release_spin_rate + speed_change + break_change + pfx_total + distance,
              data = whiff_sl2, family = binomial)

# Reduced Model
model2 <- glm(whiff ~ pitch_speed + plate_x + plate_z +
              release_spin_rate + speed_change + break_change + pfx_total + distance,
              data = whiff_sl2, family = binomial)

summary(model2)
## 
## Call:
## glm(formula = whiff ~ pitch_speed + plate_x + plate_z + release_spin_rate + 
##     speed_change + break_change + pfx_total + distance, family = binomial, 
##     data = whiff_sl2)
## 
## Coefficients:
##                     Estimate Std. Error z value             Pr(>|z|)    
## (Intercept)       -5.0425752  0.9277708  -5.435  0.00000005474968265 ***
## pitch_speed        0.0501462  0.0103594   4.841  0.00000129426814286 ***
## plate_x           -0.3698816  0.0469998  -7.870  0.00000000000000355 ***
## plate_z           -0.4913540  0.0488865 -10.051 < 0.0000000000000002 ***
## release_spin_rate  0.0002894  0.0001245   2.325               0.0201 *  
## speed_change       0.0038569  0.0228735   0.169               0.8661    
## break_change       0.2291186  0.2001940   1.144               0.2524    
## pfx_total         -0.1649991  0.1196077  -1.380               0.1677    
## distance          -0.7165732  0.0642529 -11.152 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 11359  on 13558  degrees of freedom
## Residual deviance: 11095  on 13550  degrees of freedom
##   (98 observations deleted due to missingness)
## AIC: 11113
## 
## Number of Fisher Scoring iterations: 5
preds2 <- whiff_sl2 %>% 
  mutate(prediction_log = predict(model2, whiff_sl2),
         prediction = 1 / (1 + exp(-prediction_log)),
         rounded_pred = case_when(
           prediction >= 0.5 ~ 1,
           prediction < 0.5 ~ 0
         )) %>% 
  filter(!is.na(prediction))

preds2 %>% 
  ggplot(aes(x = as.character(whiff), y = prediction)) +
  geom_boxplot() +
  geom_jitter(alpha = 0.1, width = 0.1, height = 0)

preds2 %>% 
  mutate(prediction = round(prediction, 2)) %>% 
  group_by(prediction) %>% 
  summarize(mean(whiff)) %>% 
  as.data.frame() %>% 
  ggplot(aes(x = prediction, y = `mean(whiff)`)) +
  geom_point() +
  geom_smooth(se = FALSE) +
  labs(y = "observed whiff proportion",
       x = "projected % whiff chance",
       title = "Whiff proportion by predicted whiff value",
       subtitle = "Whiff predictions have a 1% bin width")

whiff_l %>% 
  mutate(count = paste0(balls, "-", strikes)) %>% 
  filter(pitch_type == "SL") %>% 
  ggplot(aes(y = whiff, x = pfx_z*12)) +
  geom_violin() +
  geom_boxplot(alpha = 0.5, width = 0.5) +
  facet_wrap(~count) +
  labs(title = "Whiff vs. Non-Whiff by Vertical Movement",
       x = "Induced Vertical Movement (in.)",
       y = "Outcome") +
  NULL

# Sliders
whiff_l %>% 
  filter(pitch_type =="SL") %>% 
  ggplot(aes(x = -plate_x, y = plate_z)) + 
  geom_density_2d_filled(contour_var = "density", alpha = 0.5) +
  geom_zone() +
  # geom_point(alpha = 0.2) +
  coord_fixed() +
  facet_grid(cols = vars(hitter), rows = vars(whiff)) +
  theme_bw()

# Fastballs
whiff_l %>% 
  filter(pitch_type =="FF") %>% 
  ggplot(aes(x = -plate_x, y = plate_z)) + 
  geom_density_2d_filled(contour_var = "density", alpha = 0.5) +
  geom_zone() +
  # geom_point(alpha = 0.2) +
  coord_fixed() +
  facet_grid(cols = vars(hitter), rows = vars(whiff)) +
  theme_bw()

# Change-Ups
whiff_l %>% 
  filter(pitch_type =="CH") %>% 
  ggplot(aes(x = -plate_x, y = plate_z)) + 
  geom_density_2d_filled(contour_var = "density", alpha = 0.5) +
  geom_zone() +
  # geom_point(alpha = 0.2) +
  coord_fixed() +
  facet_grid(cols = vars(hitter), rows = vars(whiff)) +
  theme_bw()

whiff_l %>% 
  arrange(game_date, player_name, at_bat_number, pitch_number) %>% 
  mutate(prev_pitch = lag(pitch_type, n = 1, default = NA)) %>% 
  filter(hitter == "R",
         pitch_type == "SL",
         prev_pitch  %in% c("FF", "CH", "SL", "CU"),
         player_name == "Fried, Max") %>% 
  ggplot(aes(x = -plate_x, y = plate_z)) + 
  geom_density_2d_filled(contour_var = "density", alpha = 0.5) +
  geom_zone() +
  coord_fixed() +
  facet_grid(cols = vars(prev_pitch), rows = vars(whiff)) +
  theme_bw()

whiff_l %>% 
  arrange(game_date, player_name, at_bat_number, pitch_number) %>% 
  mutate(prev_pitch = lag(pitch_type, n = 1, default = NA)) %>% 
  filter(hitter == "R",
         pitch_type == "FF") %>% 
  ggplot(aes(x = -plate_x, y = plate_z)) + 
  geom_density_2d_filled(contour_var = "density", alpha = 0.5) +
  geom_zone() +
  coord_fixed() +
  facet_grid(cols = vars(player_name), rows = vars(whiff)) +
  theme_bw()

zoned2 <- whiff_l %>% 
  mutate(loc_x = round(plate_x*3, 0),
         loc_y = round(plate_z*3, 0))

zoned2 %>% 
  filter(pitch_type == "FF",
         plate_z > 0 & plate_z < 6,
         plate_x > -1.5 & plate_x < 1.5) %>% 
  summarize(whiff_perc = mean(whiff == "TRUE"),
            pitches = n(),
            .by = c(loc_x, loc_y, player_name)) %>% 
  filter(pitches >= 10) %>% 
  ggplot(aes(x = -loc_x, y = loc_y, fill = whiff_perc)) + 
  geom_tile() +
  scale_fill_gradient(low = "gray", high = "red") +
  facet_wrap(~ player_name) +
  coord_fixed() +
  theme_bw()


Logistic Regression Whiff Model Test (RHP Sliders / Fastballs)

all2 <- all %>% 
  filter(!is.na(pfx_z)) %>% 
  mutate(distance_sweet = sqrt(((plate_x - 0.85)^2)+((plate_z - 1.55)^2))) %>% 
  mutate(zone = case_when(distance < 2 ~ zone,
                          distance >= 2 & plate_z > (sz_top + sz_bot)/2 ~ 16,
                          distance >= 2 & plate_z <= (sz_top + sz_bot)/2 ~ 17)) %>% 
  mutate(ab_id = paste0(game_date, "_", player_id, "_", at_bat_number),
         prev_ab_id = lead(ab_id, 1)) %>% 
  mutate(prev_pitch = ifelse(ab_id == prev_ab_id, lead(pitch_type, 1), NA)) %>% 
  select(-ab_id, -prev_ab_id)

all_ff <- all2 %>%
  filter(pitch_type == "FF") %>% 
  summarize(fb_z = mean(pfx_z),
            fb_x = mean(pfx_x),
            fb_mph = mean(pitch_speed),
            .by = c(game_date, player_id))


all2 <- all2 %>% 
  left_join(all_ff, by = c("game_date" = "game_date",
                                 "player_id" = "player_id")) %>% 
  mutate(pfx_x_diff = (fb_x - pfx_x),
         pfx_z_diff = (fb_z = pfx_z),
         speed_ff_diff = (fb_mph - pitch_speed))

sliders <- all2 %>% 
  filter(p_throws == "R",
         hitter == "R") %>% 
  filter(pitch_type == "SL") %>% 
  mutate(count = paste0(balls, "_", strikes),
         prev_pitch = as.factor(prev_pitch)) %>% 
  mutate(prev_pitch_ff = ifelse(prev_pitch == "FF", 1, 0))
  
fastballs <- all2 %>% 
  filter(p_throws == "R",
         hitter == "R") %>% 
  filter(pitch_type == "FF") %>% 
  mutate(count = paste0(balls, "_", strikes),
         prev_pitch = as.factor(prev_pitch))
# Initial Model
sl_model_all <- glm(whiff ~ pitch_speed + pfx_x + pfx_z + plate_x + plate_z + zone +
              balls + strikes + pfx_total + speed_change + break_change + distance + 
                prev_pitch_ff + pfx_x_diff + pfx_z_diff + speed_ff_diff,
              data = sliders, family = binomial)

# Model Evaluation
# summary(sl_model_all)

sl_model <- glm(whiff ~ pfx_x + pfx_z + plate_x + plate_z + zone +
              balls + pfx_total + speed_change + break_change + distance + 
                prev_pitch_ff + pfx_x_diff + speed_ff_diff,
              data = sliders, family = binomial)

# Model Predictions
sl_preds <- sliders %>% 
  mutate(prediction_log = predict(sl_model, sliders),
         prediction = 1 / (1 + exp(-prediction_log)),
         rounded_pred = case_when(
           prediction >= 0.5 ~ 1,
           prediction < 0.5 ~ 0
         )) %>% 
  filter(!is.na(prediction))

sl_preds %>% 
  mutate(prediction = round(prediction, 2)) %>% 
  group_by(prediction) %>% 
  summarize(n = n(), whiff_rate = mean(whiff)) %>% 
  as.data.frame() %>% 
  ggplot(aes(x = prediction, y = whiff_rate)) +
  geom_point(aes(size = n)) + # size of bin shown on graph
  coord_fixed() +
  geom_smooth(se = FALSE) +
  labs(y = "observed whiff proportion",
       x = "projected % whiff chance",
       title = "Slider Whiff Model Prediction",
       subtitle = "Whiff proportion by predicted whiff value",
       caption = "Whiff predictions have a 1% bin width")

sl0 + sl1 + sl2

# Initial Model
ff_model_all <- glm(whiff ~ pitch_speed + pfx_x + pfx_z + plate_x + plate_z + zone +
              balls + pfx_total + speed_change + break_change + distance + 
                pfx_x_diff + pfx_z_diff + speed_ff_diff,
              data = fastballs, family = binomial)

# Model Evaluation
# summary(ff_model_all)

ff_model <- glm(whiff ~ pitch_speed + pfx_x + pfx_z + plate_z + zone +
              pfx_total + speed_change + break_change + distance + 
                pfx_x_diff,
              data = fastballs, family = binomial)

summary(ff_model)
## 
## Call:
## glm(formula = whiff ~ pitch_speed + pfx_x + pfx_z + plate_z + 
##     zone + pfx_total + speed_change + break_change + distance + 
##     pfx_x_diff, family = binomial, data = fastballs)
## 
## Coefficients:
##                Estimate Std. Error z value             Pr(>|z|)    
## (Intercept)  -10.656244   0.896061 -11.892 < 0.0000000000000002 ***
## pitch_speed    0.076168   0.009312   8.180 0.000000000000000285 ***
## pfx_x         -0.605126   0.163263  -3.706             0.000210 ***
## pfx_z          1.427415   0.394350   3.620             0.000295 ***
## plate_z        1.148325   0.046268  24.819 < 0.0000000000000002 ***
## zone          -0.011277   0.006688  -1.686             0.091785 .  
## pfx_total     -2.056400   0.447796  -4.592 0.000004384582739110 ***
## speed_change   0.159605   0.022717   7.026 0.000000000002127137 ***
## break_change   2.673909   0.209535  12.761 < 0.0000000000000002 ***
## distance      -1.364636   0.077490 -17.610 < 0.0000000000000002 ***
## pfx_x_diff    -2.145582   0.162400 -13.212 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 20320  on 31922  degrees of freedom
## Residual deviance: 18490  on 31912  degrees of freedom
##   (33 observations deleted due to missingness)
## AIC: 18512
## 
## Number of Fisher Scoring iterations: 6
# Model Predictions
ff_preds <- fastballs %>% 
  mutate(prediction_log = predict(ff_model, fastballs),
         prediction = 1 / (1 + exp(-prediction_log)),
         rounded_pred = case_when(
           prediction >= 0.5 ~ 1,
           prediction < 0.5 ~ 0
         )) %>% 
  filter(!is.na(prediction))

ff_preds %>% 
  mutate(prediction = round(prediction, 2)) %>% 
  group_by(prediction) %>% 
  summarize(mean(whiff)) %>% 
  as.data.frame() %>% 
  ggplot(aes(x = prediction, y = `mean(whiff)`)) +
  geom_point() +
  coord_fixed() +
  geom_smooth(se = FALSE) +
  labs(y = "observed whiff proportion",
       x = "projected % whiff chance",
       title = "Fastball Whiff Model Prediction",
       subtitle = "Whiff proportion by predicted whiff value",
       caption = "Whiff predictions have a 1% bin width")

ff0 + ff1 + ff2

## Sliders
# Model
sl_model_s <- glm(whiff ~ pfx_x + pfx_z + plate_x + plate_z + zone +
              balls + pfx_total + speed_change + break_change + distance + distance_sweet,
              data = sliders, family = binomial)

summary(sl_model_s)
## 
## Call:
## glm(formula = whiff ~ pfx_x + pfx_z + plate_x + plate_z + zone + 
##     balls + pfx_total + speed_change + break_change + distance + 
##     distance_sweet, family = binomial, data = sliders)
## 
## Coefficients:
##                Estimate Std. Error z value             Pr(>|z|)    
## (Intercept)    -1.87727    0.14741 -12.735 < 0.0000000000000002 ***
## pfx_x           0.48620    0.12644   3.845             0.000120 ***
## pfx_z           0.30552    0.08066   3.788             0.000152 ***
## plate_x        -0.70390    0.05572 -12.634 < 0.0000000000000002 ***
## plate_z         0.58543    0.05484  10.675 < 0.0000000000000002 ***
## zone            0.01818    0.01030   1.765             0.077553 .  
## balls           0.08350    0.02181   3.828             0.000129 ***
## pfx_total      -0.74519    0.14550  -5.122          0.000000303 ***
## speed_change    0.04244    0.01909   2.224             0.026180 *  
## break_change    0.51312    0.14775   3.473             0.000515 ***
## distance        0.86608    0.09943   8.710 < 0.0000000000000002 ***
## distance_sweet -1.76648    0.06708 -26.334 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 19154  on 22105  degrees of freedom
## Residual deviance: 17771  on 22094  degrees of freedom
## AIC: 17795
## 
## Number of Fisher Scoring iterations: 5
# Model Predictions
sl_preds_s <- sliders %>% 
  mutate(prediction_log = predict(sl_model_s, sliders),
         prediction = 1 / (1 + exp(-prediction_log)),
         rounded_pred = case_when(
           prediction >= 0.5 ~ 1,
           prediction < 0.5 ~ 0
         )) %>% 
  filter(!is.na(prediction))

sl_preds_s %>% 
  mutate(prediction = round(prediction, 2)) %>% 
  group_by(prediction) %>% 
  summarize(mean(whiff)) %>% 
  as.data.frame() %>% 
  ggplot(aes(x = prediction, y = `mean(whiff)`)) +
  geom_point() +
  coord_fixed() +
  geom_smooth(se = FALSE) +
  labs(y = "observed whiff proportion",
       x = "projected % whiff chance",
       title = "Slider Whiff Model Prediction (w/ Sweet)")

## Fastball
# Model
ff_model_s <- glm(whiff ~ pitch_speed + pfx_z + plate_x + plate_z + zone +
              strikes + pfx_total + speed_change + break_change + distance,
              data = fastballs, family = binomial)

summary(ff_model_s)
## 
## Call:
## glm(formula = whiff ~ pitch_speed + pfx_z + plate_x + plate_z + 
##     zone + strikes + pfx_total + speed_change + break_change + 
##     distance, family = binomial, data = fastballs)
## 
## Coefficients:
##                Estimate Std. Error z value             Pr(>|z|)    
## (Intercept)  -12.016000   0.882106 -13.622 < 0.0000000000000002 ***
## pitch_speed    0.086479   0.009195   9.405 < 0.0000000000000002 ***
## pfx_z          1.258882   0.171639   7.334    0.000000000000223 ***
## plate_x        0.069898   0.036546   1.913             0.055799 .  
## plate_z        1.153443   0.046862  24.614 < 0.0000000000000002 ***
## zone          -0.012318   0.006756  -1.823             0.068260 .  
## strikes        0.163423   0.024729   6.608    0.000000000038824 ***
## pfx_total     -1.503226   0.181106  -8.300 < 0.0000000000000002 ***
## speed_change   0.084469   0.023739   3.558             0.000373 ***
## break_change   1.597067   0.188845   8.457 < 0.0000000000000002 ***
## distance      -1.370889   0.077209 -17.756 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 20320  on 31922  degrees of freedom
## Residual deviance: 18620  on 31912  degrees of freedom
##   (33 observations deleted due to missingness)
## AIC: 18642
## 
## Number of Fisher Scoring iterations: 6
# Model Predictions
ff_preds_s <- fastballs %>% 
  mutate(prediction_log = predict(ff_model_s, fastballs),
         prediction = 1 / (1 + exp(-prediction_log)),
         rounded_pred = case_when(
           prediction >= 0.5 ~ 1,
           prediction < 0.5 ~ 0
         )) %>% 
  filter(!is.na(prediction))

ff_preds_s %>% 
  mutate(prediction = round(prediction, 2)) %>% 
  group_by(prediction) %>% 
  summarize(mean(whiff)) %>% 
  as.data.frame() %>% 
  ggplot(aes(x = prediction, y = `mean(whiff)`)) +
  geom_point() +
  coord_fixed() +
  geom_smooth(se = FALSE) +
  labs(y = "observed whiff proportion",
       x = "projected % whiff chance",
       title = "Fastball Whiff Model Prediction (w/ Sweet)")


Simplifying Previous Pitch Variable

rhp <- RHP %>% 
  filter(!is.na(pfx_z)) %>% 
  mutate(distance_sweet = sqrt(((plate_x - 0.85)^2)+((plate_z - 1.55)^2))) %>% 
  mutate(zone = case_when(distance < 2 ~ zone,
                          distance >= 2 & plate_z > (sz_top + sz_bot)/2 ~ 16,
                          distance >= 2 & plate_z <= (sz_top + sz_bot)/2 ~ 17)) %>% 
  mutate(ab_id = paste0(game_date, "_", player_id, "_", at_bat_number),
         prev_ab_id = lead(ab_id, 1)) %>% 
  mutate(prev_pitch = ifelse(ab_id == prev_ab_id, lead(pitch_type, 1), NA)) %>% 
  select(-ab_id, -prev_ab_id) %>% 
  mutate(prev_pitch = case_when(prev_pitch  %in% c("FF", "FC", "SI") ~ "Fastball",
                                prev_pitch %in% c("SL", "CU", "KC", 
                                                  "SV", "ST", "CS") ~ "Breaking Ball",
                                prev_pitch  %in% c("CH", "FS") ~ "Off Speed")) %>% 
  mutate(zone = relevel(as.factor(zone), ref = 5))

rhp <- rhp %>% 
  mutate(prev_pitch = ifelse(is.na(prev_pitch), "None", prev_pitch)) %>% 
  mutate(Count = paste0(balls, "-", strikes)) %>% 
  mutate(Count = as.factor(Count))

sliders2 <- rhp %>% 
  filter(hitter == "R") %>% 
  filter(pitch_type == "SL") %>% 
  mutate(count = paste0(balls, "_", strikes),
         prev_pitch = as.factor(prev_pitch)) %>% 
  mutate(prev_pitch_ff = ifelse(prev_pitch == "FF", 1, 0))
  
fastballs2 <- rhp %>% 
  filter(hitter == "R") %>% 
  filter(pitch_type == fb_type) %>% 
  mutate(count = paste0(balls, "_", strikes),
         prev_pitch = as.factor(prev_pitch))


Predicted stat rates by batter

rhp <- rhp %>% 
  mutate(is_barrel = is.barrel(LA = launch_angle, EV = launch_speed)) %>% 
  mutate(is_strike = ifelse(description  %in% c("called_strike", "swinging_strike",
                                           "foul_tip", "bunt_foul_tip", "foul_bunt",
                                           "swinging_strike_blocked", "missed_bunt"),
                       1, 0))


# Batter Whiff
batter_stats <- rhp %>% 
  summarize(rate = mean(whiff),
                      pitches = n(),
            .by = c(batter, pitch_type)) %>% 
  mutate(pitch_whiff = weighted.mean(rate, pitches),
         .by = pitch_type) %>% 
  mutate(pred_bwhiff = (pitches / 300)*rate + ((300-pitches)/300)*pitch_whiff) %>% 
  mutate(pred_bwhiff = ifelse(pitches >= 300, rate, pred_bwhiff))


# Merging Whiff Prediction with RHP
rhp <- rhp %>% 
  left_join(select(batter_stats, batter, pitch_type, pred_bwhiff), 
             by = c("batter" = "batter", "pitch_type" = "pitch_type"))

rhp <- rhp %>% 
  mutate(prev_pitch = ifelse(is.na(prev_pitch), "None", prev_pitch)) %>% 
  mutate(Count = paste0(balls, "-", strikes)) %>% 
  mutate(Count = as.factor(Count))


# Barrel
batter_stats2 <- rhp %>% 
  summarize(rate = mean(is_barrel),
                      pitches = n(),
            .by = c(batter, pitch_type)) %>% 
  mutate(pitch_barrel = weighted.mean(rate, pitches),
         .by = pitch_type) %>% 
  mutate(pred_bbarrel = (pitches / 300)*rate + ((300-pitches)/300)*pitch_barrel) %>% 
  mutate(pred_bbarrel = ifelse(pitches >= 300, rate, pred_bbarrel))

rhp <- rhp %>% 
  left_join(select(batter_stats2, batter, pitch_type, pred_bbarrel), 
             by = c("batter" = "batter", "pitch_type" = "pitch_type"))


sliders3 <- rhp %>% 
  filter(hitter == "R") %>% 
  filter(pitch_type == "SL") %>% 
  mutate(count = paste0(balls, "_", strikes),
         prev_pitch = as.factor(prev_pitch)) %>% 
  mutate(prev_pitch_ff = ifelse(prev_pitch == "FF", 1, 0))
  
fastballs3 <- rhp %>% 
  filter(hitter == "R") %>% 
  filter(pitch_type == fb_type) %>% 
  mutate(count = paste0(balls, "_", strikes),
         prev_pitch = as.factor(prev_pitch))


Predicted whiff rate by batter

WHIFF Slider Initial Model

# Initial Model
sl_model_whiff_all <- glm(whiff ~ pitch_speed + pfx_x + pfx_z + plate_x + plate_z + zone +
              count + pfx_total + speed_change + break_change + distance + prev_pitch +
                speed_fb_diff + pfx_x_fb_diff + pfx_z_fb_diff + release_spin_rate +
                I(plate_x^2) + I(plate_z^2) + pred_bwhiff,
              data = sliders3, family = binomial)

# Model Evaluation
summary(sl_model_whiff_all)
## 
## Call:
## glm(formula = whiff ~ pitch_speed + pfx_x + pfx_z + plate_x + 
##     plate_z + zone + count + pfx_total + speed_change + break_change + 
##     distance + prev_pitch + speed_fb_diff + pfx_x_fb_diff + pfx_z_fb_diff + 
##     release_spin_rate + I(plate_x^2) + I(plate_z^2) + pred_bwhiff, 
##     family = binomial, data = sliders3)
## 
## Coefficients:
##                         Estimate   Std. Error z value             Pr(>|z|)    
## (Intercept)          -1.05572579 130.94356037  -0.008             0.993567    
## pitch_speed           0.02429046   0.01003510   2.421             0.015497 *  
## pfx_x                 0.32528485   0.14288652   2.277             0.022815 *  
## pfx_z                 0.39834309   0.09595890   4.151       0.000033075878 ***
## plate_x               0.80748233   0.12765800   6.325       0.000000000253 ***
## plate_z               2.55594132   0.28344485   9.017 < 0.0000000000000002 ***
## zone1                 0.67646339   0.26178958   2.584             0.009766 ** 
## zone2                 0.67667552   0.18528884   3.652             0.000260 ***
## zone3                 0.02289412   0.20965633   0.109             0.913045    
## zone4                -0.58079680   0.27072960  -2.145             0.031929 *  
## zone6                -0.17432469   0.14483353  -1.204             0.228736    
## zone7                -0.04511564   0.24343685  -0.185             0.852972    
## zone8                 0.22867951   0.14490712   1.578             0.114540    
## zone9                 0.29923544   0.14898638   2.008             0.044593 *  
## zone11                1.21977565   0.30442252   4.007       0.000061533691 ***
## zone12               -0.18944390   0.21032547  -0.901             0.367738    
## zone13                0.49500499   0.21535459   2.299             0.021530 *  
## zone14                0.52889407   0.17237051   3.068             0.002152 ** 
## zone16               -0.18404646   1.05480688  -0.174             0.861485    
## zone17               -0.32604219   0.21788223  -1.496             0.134546    
## count0_1             -9.89952634 130.93984983  -0.076             0.939734    
## count0_2            -10.27089297 130.93985964  -0.078             0.937478    
## count1_0             -9.99869531 130.93982429  -0.076             0.939132    
## count1_1             -9.83202808 130.93985283  -0.075             0.940145    
## count1_2            -10.06882229 130.93985405  -0.077             0.938706    
## count2_0             -9.78812708 130.93991506  -0.075             0.940411    
## count2_1             -9.63211779 130.93987042  -0.074             0.941359    
## count2_2            -10.05250924 130.93985880  -0.077             0.938805    
## count3_0            -10.91935396 130.94192293  -0.083             0.933541    
## count3_1             -9.87521223 130.93999450  -0.075             0.939882    
## count3_2             -9.95773351 130.93988071  -0.076             0.939381    
## pfx_total            -0.64169840   0.15105423  -4.248       0.000021555968 ***
## speed_change          0.06878315   0.02198705   3.128             0.001758 ** 
## break_change          0.48163126   0.15541588   3.099             0.001942 ** 
## distance              0.99372528   0.17076256   5.819       0.000000005908 ***
## prev_pitchFastball   -0.16241984   0.04528214  -3.587             0.000335 ***
## prev_pitchNone      -10.32088648 130.93983515  -0.079             0.937175    
## prev_pitchOff Speed  -0.02126857   0.09929736  -0.214             0.830398    
## speed_fb_diff        -0.07392525   0.01312000  -5.635       0.000000017552 ***
## pfx_x_fb_diff        -0.03049100   0.06537696  -0.466             0.640938    
## pfx_z_fb_diff         0.04868005   0.06507598   0.748             0.454430    
## release_spin_rate     0.00034439   0.00009848   3.497             0.000470 ***
## I(plate_x^2)         -0.89386802   0.07865626 -11.364 < 0.0000000000000002 ***
## I(plate_z^2)         -0.63710529   0.06366125 -10.008 < 0.0000000000000002 ***
## pred_bwhiff          18.01795901   0.94440432  19.079 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 19130  on 22074  degrees of freedom
## Residual deviance: 17082  on 22030  degrees of freedom
##   (31 observations deleted due to missingness)
## AIC: 17172
## 
## Number of Fisher Scoring iterations: 10

WHIFF Slider Refined Model & Predictions

# Refined Model
sl_model_whiff <- glm(whiff ~ pitch_speed + pfx_x + pfx_z + plate_x + plate_z + zone +
              pfx_total + speed_change + break_change + distance + prev_pitch +
                speed_fb_diff + release_spin_rate +
                I(plate_z^2) + pred_bwhiff,
              data = sliders3, family = binomial)

# Model Evaluation
summary(sl_model_whiff)
## 
## Call:
## glm(formula = whiff ~ pitch_speed + pfx_x + pfx_z + plate_x + 
##     plate_z + zone + pfx_total + speed_change + break_change + 
##     distance + prev_pitch + speed_fb_diff + release_spin_rate + 
##     I(plate_z^2) + pred_bwhiff, family = binomial, data = sliders3)
## 
## Coefficients:
##                        Estimate  Std. Error z value             Pr(>|z|)    
## (Intercept)         -8.49113461  0.91541155  -9.276 < 0.0000000000000002 ***
## pitch_speed          0.02323610  0.00959649   2.421             0.015465 *  
## pfx_x                0.31047257  0.13219128   2.349             0.018841 *  
## pfx_z                0.45779612  0.08749780   5.232        0.00000016761 ***
## plate_x             -0.14336419  0.08553803  -1.676             0.093733 .  
## plate_z              0.74628734  0.20871951   3.576             0.000349 ***
## zone1                0.54083270  0.25643055   2.109             0.034938 *  
## zone2                1.03289659  0.18080609   5.713        0.00000001112 ***
## zone3                0.82164815  0.19804066   4.149        0.00003340968 ***
## zone4               -0.92160171  0.26646902  -3.459             0.000543 ***
## zone6                0.45119119  0.13412429   3.364             0.000768 ***
## zone7               -0.18593965  0.23907119  -0.778             0.436711    
## zone8                0.59609886  0.13993617   4.260        0.00002046179 ***
## zone9                1.10379034  0.13131235   8.406 < 0.0000000000000002 ***
## zone11               0.82240726  0.29744016   2.765             0.005693 ** 
## zone12               0.79658491  0.19337052   4.119        0.00003797378 ***
## zone13               0.58993031  0.21505990   2.743             0.006086 ** 
## zone14               1.47733794  0.15153304   9.749 < 0.0000000000000002 ***
## zone16              -0.30376615  1.05330192  -0.288             0.773045    
## zone17               0.22697632  0.21207607   1.070             0.284503    
## pfx_total           -0.65308669  0.14905904  -4.381        0.00001179213 ***
## speed_change         0.05361032  0.02136886   2.509             0.012114 *  
## break_change         0.47050907  0.15243457   3.087             0.002024 ** 
## distance            -0.19574290  0.13277515  -1.474             0.140416    
## prev_pitchFastball  -0.14076785  0.04493543  -3.133             0.001732 ** 
## prev_pitchNone      -0.34439055  0.05348173  -6.439        0.00000000012 ***
## prev_pitchOff Speed -0.00714444  0.09788183  -0.073             0.941814    
## speed_fb_diff       -0.06463770  0.01186208  -5.449        0.00000005062 ***
## release_spin_rate    0.00037130  0.00009683   3.834             0.000126 ***
## I(plate_z^2)        -0.28655748  0.05043700  -5.681        0.00000001335 ***
## pred_bwhiff         18.65870926  0.93484343  19.959 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 19130  on 22074  degrees of freedom
## Residual deviance: 17288  on 22044  degrees of freedom
##   (31 observations deleted due to missingness)
## AIC: 17350
## 
## Number of Fisher Scoring iterations: 7
# Model Predictions
sl_preds_whiff <- sliders3 %>% 
  mutate(prediction_log = predict(sl_model_whiff, sliders3),
         prediction = 1 / (1 + exp(-prediction_log)),
         rounded_pred = case_when(
           prediction >= 0.5 ~ 1,
           prediction < 0.5 ~ 0
         )) %>% 
  filter(!is.na(prediction))

ggplot(sl_preds_whiff, aes(x = prediction, y = as.numeric(whiff))) +
  geom_point(alpha = 0.01) +
  geom_smooth() +
  geom_smooth(method = "lm") +
  geom_abline(slope = 1, intercept = 0, color = "red") +
  theme_bw()

WHIFF Fastball Initial Model

# Initial Model
fb_model_whiff_all <- glm(whiff ~ pitch_speed + pfx_x + pfx_z + plate_x + plate_z + zone +
              count + pfx_total + speed_change + break_change + distance + prev_pitch +
                release_spin_rate + I(plate_x^2) + I(plate_z^2) + pred_bwhiff,
              data = fastballs3, family = binomial)

# Model Evaluation
summary(fb_model_whiff_all)
## 
## Call:
## glm(formula = whiff ~ pitch_speed + pfx_x + pfx_z + plate_x + 
##     plate_z + zone + count + pfx_total + speed_change + break_change + 
##     distance + prev_pitch + release_spin_rate + I(plate_x^2) + 
##     I(plate_z^2) + pred_bwhiff, family = binomial, data = fastballs3)
## 
## Coefficients:
##                        Estimate  Std. Error z value             Pr(>|z|)    
## (Intercept)          -7.5816274 119.4715727  -0.063              0.94940    
## pitch_speed           0.0557803   0.0078739   7.084 0.000000000001398783 ***
## pfx_x                 0.6463242   0.0808860   7.991 0.000000000000001343 ***
## pfx_z                -1.1705641   0.1174326  -9.968 < 0.0000000000000002 ***
## plate_x              -0.0565644   0.0638425  -0.886              0.37562    
## plate_z               3.0664785   0.3787614   8.096 0.000000000000000568 ***
## zone1                 0.0957271   0.1201531   0.797              0.42562    
## zone2                 0.1247986   0.0970758   1.286              0.19859    
## zone3                 0.2886408   0.1126223   2.563              0.01038 *  
## zone4                -0.4660176   0.1183909  -3.936 0.000082760092236976 ***
## zone6                 0.1620347   0.0943015   1.718              0.08575 .  
## zone7                 0.1196145   0.1593547   0.751              0.45288    
## zone8                -0.2063144   0.1310306  -1.575              0.11536    
## zone9                -0.2041031   0.1431304  -1.426              0.15387    
## zone11                0.2641286   0.1442519   1.831              0.06710 .  
## zone12                0.0254171   0.1395724   0.182              0.85550    
## zone13                0.9422216   0.1658940   5.680 0.000000013496167951 ***
## zone14                0.3223702   0.1583034   2.036              0.04171 *  
## zone16               -1.8001997   0.3403438  -5.289 0.000000122747212274 ***
## zone17                1.1591083   0.4145653   2.796              0.00517 ** 
## count0_1             -6.6130700 119.4680996  -0.055              0.95586    
## count0_2             -6.8621109 119.4681077  -0.057              0.95420    
## count1_0             -6.7844000 119.4680778  -0.057              0.95471    
## count1_1             -6.5961524 119.4681027  -0.055              0.95597    
## count1_2             -6.6379474 119.4680981  -0.056              0.95569    
## count2_0             -6.8260742 119.4681281  -0.057              0.95444    
## count2_1             -6.5567532 119.4681139  -0.055              0.95623    
## count2_2             -6.6337006 119.4680999  -0.056              0.95572    
## count3_0             -8.6812066 119.4685239  -0.073              0.94207    
## count3_1             -6.8714283 119.4681450  -0.058              0.95413    
## count3_2             -6.8483351 119.4681093  -0.057              0.95429    
## pfx_total             0.8451315   0.1825259   4.630 0.000003653101190601 ***
## speed_change          0.0909285   0.0218399   4.163 0.000031353786683178 ***
## break_change          1.5237499   0.1692337   9.004 < 0.0000000000000002 ***
## distance              0.2734813   0.1723536   1.587              0.11257    
## prev_pitchFastball    0.0614724   0.0437354   1.406              0.15986    
## prev_pitchNone       -7.0729502 119.4680892  -0.059              0.95279    
## prev_pitchOff Speed   0.0356306   0.0847117   0.421              0.67404    
## release_spin_rate     0.0004958   0.0001154   4.295 0.000017442646480823 ***
## I(plate_x^2)         -0.7993284   0.0797653 -10.021 < 0.0000000000000002 ***
## I(plate_z^2)         -0.4638120   0.0712547  -6.509 0.000000000075547577 ***
## pred_bwhiff          17.4920563   0.6896532  25.364 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 25315  on 42647  degrees of freedom
## Residual deviance: 22519  on 42606  degrees of freedom
##   (129 observations deleted due to missingness)
## AIC: 22603
## 
## Number of Fisher Scoring iterations: 9

WHIFF Fastball Refined Model & Predictions

# Refined Model
fb_model_whiff <- glm(whiff ~ pitch_speed + pfx_x + pfx_z + plate_x + plate_z + zone +
              pfx_total + speed_change + break_change + distance + prev_pitch +
                release_spin_rate + I(plate_x^2) + I(plate_z^2) + pred_bwhiff,
              data = fastballs3, family = binomial)

# Model Evaluation
summary(fb_model_whiff)
## 
## Call:
## glm(formula = whiff ~ pitch_speed + pfx_x + pfx_z + plate_x + 
##     plate_z + zone + pfx_total + speed_change + break_change + 
##     distance + prev_pitch + release_spin_rate + I(plate_x^2) + 
##     I(plate_z^2) + pred_bwhiff, family = binomial, data = fastballs3)
## 
## Coefficients:
##                        Estimate  Std. Error z value             Pr(>|z|)    
## (Intercept)         -14.2154028   0.9000199 -15.795 < 0.0000000000000002 ***
## pitch_speed           0.0550092   0.0078550   7.003 0.000000000002503755 ***
## pfx_x                 0.6338384   0.0806196   7.862 0.000000000000003778 ***
## pfx_z                -1.1561825   0.1172466  -9.861 < 0.0000000000000002 ***
## plate_x              -0.0615590   0.0636244  -0.968              0.33328    
## plate_z               3.0392831   0.3777182   8.046 0.000000000000000852 ***
## zone1                 0.0929298   0.1199153   0.775              0.43836    
## zone2                 0.1201440   0.0968204   1.241              0.21464    
## zone3                 0.2975429   0.1123314   2.649              0.00808 ** 
## zone4                -0.4653376   0.1182299  -3.936 0.000082895313099199 ***
## zone6                 0.1659789   0.0940813   1.764              0.07770 .  
## zone7                 0.1231180   0.1591132   0.774              0.43906    
## zone8                -0.2125384   0.1309014  -1.624              0.10445    
## zone9                -0.2114364   0.1430251  -1.478              0.13932    
## zone11                0.2615133   0.1440152   1.816              0.06939 .  
## zone12                0.0279241   0.1394753   0.200              0.84132    
## zone13                0.9459586   0.1657187   5.708 0.000000011416409673 ***
## zone14                0.3223060   0.1581047   2.039              0.04149 *  
## zone16               -1.8128981   0.3402077  -5.329 0.000000098864591113 ***
## zone17                1.1355316   0.4142803   2.741              0.00613 ** 
## pfx_total             0.8280238   0.1821148   4.547 0.000005449044810380 ***
## speed_change          0.1002090   0.0209493   4.783 0.000001723549942532 ***
## break_change          1.5058941   0.1686232   8.931 < 0.0000000000000002 ***
## distance              0.2734463   0.1719881   1.590              0.11185    
## prev_pitchFastball    0.0332318   0.0433347   0.767              0.44316    
## prev_pitchNone       -0.3721674   0.0522667  -7.121 0.000000000001075026 ***
## prev_pitchOff Speed   0.0288480   0.0841504   0.343              0.73174    
## release_spin_rate     0.0005103   0.0001152   4.431 0.000009365819796711 ***
## I(plate_x^2)         -0.7876100   0.0794838  -9.909 < 0.0000000000000002 ***
## I(plate_z^2)         -0.4586255   0.0710512  -6.455 0.000000000108317436 ***
## pred_bwhiff          17.5208378   0.6871512  25.498 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 25315  on 42647  degrees of freedom
## Residual deviance: 22622  on 42617  degrees of freedom
##   (129 observations deleted due to missingness)
## AIC: 22684
## 
## Number of Fisher Scoring iterations: 7
# Model Predictions
fb_preds_whiff <- fastballs3 %>% 
  mutate(prediction_log = predict(fb_model_whiff, fastballs3),
         prediction = 1 / (1 + exp(-prediction_log)),
         rounded_pred = case_when(
           prediction >= 0.5 ~ 1,
           prediction < 0.5 ~ 0
         )) %>% 
  filter(!is.na(prediction))

ggplot(fb_preds_whiff, aes(x = prediction, y = as.numeric(whiff))) +
  geom_point(alpha = 0.01) +
  geom_smooth() +
  geom_smooth(method = "lm") +
  geom_abline(slope = 1, intercept = 0, color = "red") +
  theme_bw()


Predicted barrel rate by batter

BARREL Slider Initial Model

# Initial Model
sl_model_barrel_all <- glm(is_barrel ~ pitch_speed + pfx_x + pfx_z + plate_x + plate_z + zone +
              count + pfx_total + speed_change + break_change + distance + prev_pitch +
                speed_fb_diff + pfx_x_fb_diff + pfx_z_fb_diff + release_spin_rate +
                I(plate_x^2) + I(plate_z^2) + pred_bbarrel,
              data = sliders3, family = binomial)

# Model Evaluation
summary(sl_model_barrel_all)
## 
## Call:
## glm(formula = is_barrel ~ pitch_speed + pfx_x + pfx_z + plate_x + 
##     plate_z + zone + count + pfx_total + speed_change + break_change + 
##     distance + prev_pitch + speed_fb_diff + pfx_x_fb_diff + pfx_z_fb_diff + 
##     release_spin_rate + I(plate_x^2) + I(plate_z^2) + pred_bbarrel, 
##     family = binomial, data = sliders3)
## 
## Coefficients:
##                            Estimate      Std. Error z value
## (Intercept)             7.293875026 18646.259525496   0.000
## pitch_speed            -0.081909424     0.031449146  -2.605
## pfx_x                  -0.453897008     0.456442906  -0.994
## pfx_z                  -0.362468383     0.305897768  -1.185
## plate_x                -0.546581403     0.420562756  -1.300
## plate_z                 6.937112624     2.582609967   2.686
## zone1                  -0.393990464     0.580685974  -0.678
## zone2                  -0.188618258     0.399110399  -0.473
## zone3                   0.074022838     0.515658794   0.144
## zone4                   0.184051292     0.360808591   0.510
## zone6                   0.214614109     0.308348434   0.696
## zone7                   0.678258744     0.450161697   1.507
## zone8                  -0.162705876     0.331497844  -0.491
## zone9                  -0.213937320     0.405685694  -0.527
## zone11                 -0.134046455     0.959872134  -0.140
## zone12                 -0.018617622     0.887775004  -0.021
## zone13                 -1.397251651     1.138631154  -1.227
## zone14                 -1.415418799     0.793204765  -1.784
## zone16                -10.692157389  2047.215128740  -0.005
## zone17                 -9.660105428   338.209372556  -0.029
## count0_1              -13.211829787 18646.259012691  -0.001
## count0_2              -12.504457471 18646.259013646  -0.001
## count1_0              -13.120863562 18646.259010632  -0.001
## count1_1              -13.207366384 18646.259012947  -0.001
## count1_2              -12.772845644 18646.259012899  -0.001
## count2_0              -13.859592726 18646.259021090  -0.001
## count2_1              -12.564168650 18646.259013105  -0.001
## count2_2              -12.681792595 18646.259013041  -0.001
## count3_0              -30.850483170 19171.044211973  -0.002
## count3_1              -12.707591888 18646.259017700  -0.001
## count3_2              -12.336059230 18646.259013132  -0.001
## pfx_total               0.328569112     0.471870190   0.696
## speed_change           -0.146712387     0.073743383  -1.989
## break_change           -0.548699599     0.491310983  -1.117
## distance               -0.234443556     0.769579412  -0.305
## prev_pitchFastball      0.118490700     0.152430096   0.777
## prev_pitchNone        -13.373199243 18646.259011235  -0.001
## prev_pitchOff Speed     0.061249736     0.317536347   0.193
## speed_fb_diff           0.097073177     0.045164905   2.149
## pfx_x_fb_diff          -0.041980060     0.222089094  -0.189
## pfx_z_fb_diff          -0.027667102     0.223107751  -0.124
## release_spin_rate      -0.000007429     0.000340174  -0.022
## I(plate_x^2)           -2.073882655     0.703505253  -2.948
## I(plate_z^2)           -1.425644038     0.522422545  -2.729
## pred_bbarrel          162.123058716    11.221906112  14.447
##                                 Pr(>|z|)    
## (Intercept)                      0.99969    
## pitch_speed                      0.00920 ** 
## pfx_x                            0.32002    
## pfx_z                            0.23604    
## plate_x                          0.19372    
## plate_z                          0.00723 ** 
## zone1                            0.49746    
## zone2                            0.63650    
## zone3                            0.88586    
## zone4                            0.60998    
## zone6                            0.48642    
## zone7                            0.13189    
## zone8                            0.62355    
## zone9                            0.59795    
## zone11                           0.88894    
## zone12                           0.98327    
## zone13                           0.21977    
## zone14                           0.07435 .  
## zone16                           0.99583    
## zone17                           0.97721    
## count0_1                         0.99943    
## count0_2                         0.99946    
## count1_0                         0.99944    
## count1_1                         0.99943    
## count1_2                         0.99945    
## count2_0                         0.99941    
## count2_1                         0.99946    
## count2_2                         0.99946    
## count3_0                         0.99872    
## count3_1                         0.99946    
## count3_2                         0.99947    
## pfx_total                        0.48623    
## speed_change                     0.04665 *  
## break_change                     0.26408    
## distance                         0.76064    
## prev_pitchFastball               0.43696    
## prev_pitchNone                   0.99943    
## prev_pitchOff Speed              0.84704    
## speed_fb_diff                    0.03161 *  
## pfx_x_fb_diff                    0.85007    
## pfx_z_fb_diff                    0.90131    
## release_spin_rate                0.98258    
## I(plate_x^2)                     0.00320 ** 
## I(plate_z^2)                     0.00635 ** 
## pred_bbarrel        < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2844.2  on 22074  degrees of freedom
## Residual deviance: 2101.2  on 22030  degrees of freedom
##   (31 observations deleted due to missingness)
## AIC: 2191.2
## 
## Number of Fisher Scoring iterations: 20

BARREL Slider Refined Model & Predictions

# Refined Model
sl_model_barrel <- glm(is_barrel ~ pitch_speed + plate_z + zone +
              prev_pitch + speed_fb_diff + I(plate_x^2) + I(plate_z^2) + pred_bbarrel,
              data = sliders3, family = binomial)

# Model Evaluation
summary(sl_model_barrel)
## 
## Call:
## glm(formula = is_barrel ~ pitch_speed + plate_z + zone + prev_pitch + 
##     speed_fb_diff + I(plate_x^2) + I(plate_z^2) + pred_bbarrel, 
##     family = binomial, data = sliders3)
## 
## Coefficients:
##                       Estimate Std. Error z value             Pr(>|z|)    
## (Intercept)           -6.01014    3.32848  -1.806              0.07097 .  
## pitch_speed           -0.08675    0.02824  -3.072              0.00213 ** 
## plate_z                7.42214    1.67011   4.444           0.00000883 ***
## zone1                 -0.10985    0.49604  -0.221              0.82474    
## zone2                 -0.23491    0.35924  -0.654              0.51318    
## zone3                 -0.19214    0.44340  -0.433              0.66478    
## zone4                  0.48358    0.25840   1.871              0.06129 .  
## zone6                 -0.07541    0.23326  -0.323              0.74648    
## zone7                  0.89451    0.33597   2.662              0.00776 ** 
## zone8                 -0.22664    0.27866  -0.813              0.41603    
## zone9                 -0.50425    0.32297  -1.561              0.11846    
## zone11                 0.19732    0.86422   0.228              0.81940    
## zone12                -0.40796    0.84107  -0.485              0.62765    
## zone13                -1.05222    1.10086  -0.956              0.33916    
## zone14                -1.78436    0.75086  -2.376              0.01748 *  
## zone16               -10.32459 2114.92382  -0.005              0.99610    
## zone17               -10.04117  358.95710  -0.028              0.97768    
## prev_pitchFastball     0.12237    0.15035   0.814              0.41570    
## prev_pitchNone        -0.42001    0.17970  -2.337              0.01942 *  
## prev_pitchOff Speed    0.12976    0.31370   0.414              0.67913    
## speed_fb_diff          0.08374    0.03580   2.339              0.01932 *  
## I(plate_x^2)          -2.37326    0.49198  -4.824           0.00000141 ***
## I(plate_z^2)          -1.52392    0.34552  -4.410           0.00001031 ***
## pred_bbarrel         160.34359   11.04768  14.514 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2844.6  on 22092  degrees of freedom
## Residual deviance: 2135.5  on 22069  degrees of freedom
##   (13 observations deleted due to missingness)
## AIC: 2183.5
## 
## Number of Fisher Scoring iterations: 20
# Model Predictions
sl_preds_barrel <- sliders3 %>% 
  mutate(prediction_log = predict(sl_model_barrel, sliders3),
         prediction = 1 / (1 + exp(-prediction_log)),
         rounded_pred = case_when(
           prediction >= 0.5 ~ 1,
           prediction < 0.5 ~ 0
         )) %>% 
  filter(!is.na(prediction))

ggplot(sl_preds_barrel, aes(x = prediction, y = as.numeric(is_barrel))) +
  geom_point(alpha = 0.01) +
  geom_smooth() +
  geom_smooth(method = "lm") +
  geom_abline(slope = 1, intercept = 0, color = "red") +
  theme_bw()

BARREL Fastball Initial Model

# Initial Model
fb_model_barrel_all <- glm(is_barrel ~ pitch_speed + pfx_x + pfx_z + plate_x + plate_z + zone +
              count + pfx_total + speed_change + break_change + distance + prev_pitch +
                release_spin_rate + I(plate_x^2) + I(plate_z^2) + pred_bbarrel,
              data = fastballs3, family = binomial)

# Model Evaluation
summary(fb_model_barrel_all)
## 
## Call:
## glm(formula = is_barrel ~ pitch_speed + pfx_x + pfx_z + plate_x + 
##     plate_z + zone + count + pfx_total + speed_change + break_change + 
##     distance + prev_pitch + release_spin_rate + I(plate_x^2) + 
##     I(plate_z^2) + pred_bbarrel, family = binomial, data = fastballs3)
## 
## Coefficients:
##                          Estimate    Std. Error z value             Pr(>|z|)
## (Intercept)            14.1690697 17730.3514959   0.001              0.99936
## pitch_speed            -0.0908045     0.0176352  -5.149          0.000000262
## pfx_x                  -0.1107500     0.2061660  -0.537              0.59114
## pfx_z                   0.3520933     0.2950100   1.193              0.23268
## plate_x                -0.3151407     0.2052985  -1.535              0.12477
## plate_z                 5.5502796     1.2987011   4.274          0.000019224
## zone1                  -0.1820119     0.2647474  -0.687              0.49177
## zone2                   0.0840930     0.1886724   0.446              0.65581
## zone3                   0.1178921     0.2595632   0.454              0.64969
## zone4                   0.3193995     0.1881786   1.697              0.08964
## zone6                   0.1669336     0.1875701   0.890              0.37348
## zone7                   0.4791432     0.2904666   1.650              0.09903
## zone8                  -0.1615013     0.2334322  -0.692              0.48903
## zone9                  -0.3147077     0.3116269  -1.010              0.31255
## zone11                  0.0296693     0.3580654   0.083              0.93396
## zone12                  0.4714067     0.3472315   1.358              0.17459
## zone13                  0.5349522     0.4132797   1.294              0.19552
## zone14                -14.3917040   221.4000680  -0.065              0.94817
## zone16                -11.6632825   424.4048236  -0.027              0.97808
## zone17                -10.1341457   526.2729106  -0.019              0.98464
## count0_1              -16.7659764 17730.3513307  -0.001              0.99925
## count0_2              -16.6629977 17730.3513313  -0.001              0.99925
## count1_0              -16.7757680 17730.3513299  -0.001              0.99925
## count1_1              -16.5670098 17730.3513307  -0.001              0.99925
## count1_2              -16.4600242 17730.3513308  -0.001              0.99926
## count2_0              -16.3770285 17730.3513311  -0.001              0.99926
## count2_1              -16.3456247 17730.3513309  -0.001              0.99926
## count2_2              -16.1580357 17730.3513306  -0.001              0.99927
## count3_0              -16.9047437 17730.3513329  -0.001              0.99924
## count3_1              -16.1803143 17730.3513313  -0.001              0.99927
## count3_2              -16.2369068 17730.3513307  -0.001              0.99927
## pfx_total              -0.1464570     0.4351920  -0.337              0.73647
## speed_change           -0.0964585     0.0456430  -2.113              0.03457
## break_change           -0.9368259     0.3748858  -2.499              0.01246
## distance               -0.3962217     0.4272619  -0.927              0.35374
## prev_pitchFastball      0.0625169     0.0952142   0.657              0.51144
## prev_pitchNone        -17.0356334 17730.3513303  -0.001              0.99923
## prev_pitchOff Speed    -0.1473366     0.1822982  -0.808              0.41897
## release_spin_rate      -0.0006799     0.0002429  -2.799              0.00512
## I(plate_x^2)           -1.5757072     0.3077263  -5.120          0.000000305
## I(plate_z^2)           -1.0701619     0.2482737  -4.310          0.000016295
## pred_bbarrel          109.1558372     5.8158491  18.769 < 0.0000000000000002
##                        
## (Intercept)            
## pitch_speed         ***
## pfx_x                  
## pfx_z                  
## plate_x                
## plate_z             ***
## zone1                  
## zone2                  
## zone3                  
## zone4               .  
## zone6                  
## zone7               .  
## zone8                  
## zone9                  
## zone11                 
## zone12                 
## zone13                 
## zone14                 
## zone16                 
## zone17                 
## count0_1               
## count0_2               
## count1_0               
## count1_1               
## count1_2               
## count2_0               
## count2_1               
## count2_2               
## count3_0               
## count3_1               
## count3_2               
## pfx_total              
## speed_change        *  
## break_change        *  
## distance               
## prev_pitchFastball     
## prev_pitchNone         
## prev_pitchOff Speed    
## release_spin_rate   ** 
## I(plate_x^2)        ***
## I(plate_z^2)        ***
## pred_bbarrel        ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 7337.6  on 42647  degrees of freedom
## Residual deviance: 6177.1  on 42606  degrees of freedom
##   (129 observations deleted due to missingness)
## AIC: 6261.1
## 
## Number of Fisher Scoring iterations: 19

BARREL Fastball Refined Model & Predictions

# Refined Model
fb_model_barrel <- glm(is_barrel ~ pitch_speed + pfx_z + plate_z + zone +
              break_change + prev_pitch +
                release_spin_rate + I(plate_x^2) + I(plate_z^2) + pred_bbarrel,
              data = fastballs3, family = binomial)

# Model Evaluation
summary(fb_model_barrel)
## 
## Call:
## glm(formula = is_barrel ~ pitch_speed + pfx_z + plate_z + zone + 
##     break_change + prev_pitch + release_spin_rate + I(plate_x^2) + 
##     I(plate_z^2) + pred_bbarrel, family = binomial, data = fastballs3)
## 
## Coefficients:
##                        Estimate  Std. Error z value             Pr(>|z|)    
## (Intercept)          -3.3077526   1.8946710  -1.746              0.08084 .  
## pitch_speed          -0.0944753   0.0157784  -5.988 0.000000002129105663 ***
## pfx_z                 0.2619646   0.1146813   2.284              0.02235 *  
## plate_z               6.6124560   0.8723678   7.580 0.000000000000034584 ***
## zone1                -0.1332453   0.2135455  -0.624              0.53265    
## zone2                -0.0091205   0.1572915  -0.058              0.95376    
## zone3                -0.1650322   0.2016200  -0.819              0.41305    
## zone4                 0.4150531   0.1427792   2.907              0.00365 ** 
## zone6                -0.0599910   0.1406864  -0.426              0.66980    
## zone7                 0.5368172   0.2333650   2.300              0.02143 *  
## zone8                -0.2251755   0.2084197  -1.080              0.27997    
## zone9                -0.5748546   0.2686229  -2.140              0.03235 *  
## zone11                0.0951644   0.3029716   0.314              0.75344    
## zone12                0.1191472   0.2829984   0.421              0.67374    
## zone13                0.6782115   0.3473910   1.952              0.05090 .  
## zone14              -14.7673307 220.5172265  -0.067              0.94661    
## zone16              -11.5851355 427.2020747  -0.027              0.97837    
## zone17              -10.2612927 539.2332259  -0.019              0.98482    
## break_change         -1.0184622   0.3284472  -3.101              0.00193 ** 
## prev_pitchFastball    0.0412383   0.0941765   0.438              0.66147    
## prev_pitchNone       -0.5083031   0.1124689  -4.519 0.000006198651259736 ***
## prev_pitchOff Speed  -0.0781888   0.1806309  -0.433              0.66511    
## release_spin_rate    -0.0007461   0.0002314  -3.224              0.00126 ** 
## I(plate_x^2)         -1.7720270   0.2178009  -8.136 0.000000000000000409 ***
## I(plate_z^2)         -1.2750311   0.1584656  -8.046 0.000000000000000855 ***
## pred_bbarrel        108.2006073   5.7652900  18.768 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 7339.0  on 42689  degrees of freedom
## Residual deviance: 6211.6  on 42664  degrees of freedom
##   (87 observations deleted due to missingness)
## AIC: 6263.6
## 
## Number of Fisher Scoring iterations: 19
# Model Predictions
fb_preds_barrel <- fastballs3 %>% 
  mutate(prediction_log = predict(fb_model_barrel, fastballs3),
         prediction = 1 / (1 + exp(-prediction_log)),
         rounded_pred = case_when(
           prediction >= 0.5 ~ 1,
           prediction < 0.5 ~ 0
         )) %>% 
  filter(!is.na(prediction))

ggplot(fb_preds_barrel, aes(x = prediction, y = as.numeric(is_barrel))) +
  geom_point(alpha = 0.01) +
  geom_smooth() +
  geom_smooth(method = "lm") +
  geom_abline(slope = 1, intercept = 0, color = "red") +
  theme_bw()


Predicted strike rate by batter

STRIKE Slider Initial Model

# Initial Model
sl_model_strike_all <- glm(is_strike ~ pitch_speed + pfx_x + pfx_z + plate_x + plate_z + zone +
              count + pfx_total + speed_change + break_change + distance + prev_pitch +
                speed_fb_diff + pfx_x_fb_diff + pfx_z_fb_diff + release_spin_rate +
                I(plate_x^2) + I(plate_z^2),
              data = sliders3, family = binomial)

# Model Evaluation
summary(sl_model_strike_all)
## 
## Call:
## glm(formula = is_strike ~ pitch_speed + pfx_x + pfx_z + plate_x + 
##     plate_z + zone + count + pfx_total + speed_change + break_change + 
##     distance + prev_pitch + speed_fb_diff + pfx_x_fb_diff + pfx_z_fb_diff + 
##     release_spin_rate + I(plate_x^2) + I(plate_z^2), family = binomial, 
##     data = sliders3)
## 
## Coefficients:
##                         Estimate   Std. Error z value             Pr(>|z|)    
## (Intercept)           4.75024001  79.39850610   0.060              0.95229    
## pitch_speed           0.00925384   0.00791292   1.169              0.24222    
## pfx_x                 0.25808837   0.11130676   2.319              0.02041 *  
## pfx_z                 0.11982747   0.07441403   1.610              0.10734    
## plate_x               0.46897830   0.09065777   5.173  0.00000023028778269 ***
## plate_z               2.45280263   0.22575675  10.865 < 0.0000000000000002 ***
## zone1                 1.19664853   0.15411546   7.765  0.00000000000000819 ***
## zone2                 0.67612487   0.11719114   5.769  0.00000000795450029 ***
## zone3                 0.55793916   0.13179678   4.233  0.00002302569312602 ***
## zone4                 0.49378629   0.12135859   4.069  0.00004725174229177 ***
## zone6                -0.03804441   0.09187810  -0.414              0.67882    
## zone7                 0.47201047   0.14628566   3.227              0.00125 ** 
## zone8                 0.05410273   0.09741951   0.555              0.57865    
## zone9                 0.12880167   0.10419625   1.236              0.21640    
## zone11               -0.48623687   0.21247328  -2.288              0.02211 *  
## zone12               -0.45361904   0.14641379  -3.098              0.00195 ** 
## zone13               -0.48151283   0.16669678  -2.889              0.00387 ** 
## zone14               -0.16530831   0.12688952  -1.303              0.19265    
## zone16               -1.81037578   1.03597979  -1.748              0.08055 .  
## zone17               -0.26649423   0.17147450  -1.554              0.12015    
## count0_1            -10.43100365  79.39479176  -0.131              0.89547    
## count0_2            -10.73412348  79.39480416  -0.135              0.89245    
## count1_0             -9.98768532  79.39476602  -0.126              0.89989    
## count1_1            -10.32495397  79.39479497  -0.130              0.89653    
## count1_2            -10.68438113  79.39479772  -0.135              0.89295    
## count2_0             -9.68952636  79.39485026  -0.122              0.90287    
## count2_1            -10.22369978  79.39481514  -0.129              0.89754    
## count2_2            -10.72622292  79.39480296  -0.135              0.89253    
## count3_0             -9.25413212  79.39567788  -0.117              0.90721    
## count3_1            -10.08062665  79.39492292  -0.127              0.89897    
## count3_2            -10.64873659  79.39482608  -0.134              0.89330    
## pfx_total            -0.52072766   0.11713484  -4.446  0.00000876712754740 ***
## speed_change          0.03223030   0.01723334   1.870              0.06145 .  
## break_change          0.28490650   0.12116882   2.351              0.01871 *  
## distance              0.85074453   0.13280061   6.406  0.00000000014921128 ***
## prev_pitchFastball   -0.16188360   0.03758216  -4.307  0.00001651408316119 ***
## prev_pitchNone       -9.88344393  79.39477706  -0.124              0.90093    
## prev_pitchOff Speed  -0.03574840   0.08399368  -0.426              0.67039    
## speed_fb_diff        -0.06257527   0.01042094  -6.005  0.00000000191614724 ***
## pfx_x_fb_diff         0.00500042   0.05148048   0.097              0.92262    
## pfx_z_fb_diff         0.03899110   0.05179426   0.753              0.45157    
## release_spin_rate     0.00034815   0.00007784   4.473  0.00000772983997455 ***
## I(plate_x^2)         -0.90577992   0.06212935 -14.579 < 0.0000000000000002 ***
## I(plate_z^2)         -0.55759221   0.04897547 -11.385 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 27338  on 22074  degrees of freedom
## Residual deviance: 24690  on 22031  degrees of freedom
##   (31 observations deleted due to missingness)
## AIC: 24778
## 
## Number of Fisher Scoring iterations: 9

STRIKE Slider Refined Model & Predictions

# Refined Model

sliders3 <- sliders3 %>% 
  mutate(fb_prev = ifelse(prev_pitch == "Fastball", 1, 0)) %>% 
  mutate(count_s = case_when(count  %in% c("0_0", "1_1", "2_2") ~ "Even",
                             count  %in%  c("1_0", "2_1", "2_0", "3_0", "3_1") ~ "Hitter",
                             count %in%  c("0_1", "0_2", "1_2") ~ "Pitcher",
                             count == "3_2" ~ "Full"))

sl_model_strike <- glm(is_strike ~ pfx_x + pfx_z + plate_x + plate_z + zone +
              count_s + pfx_total + break_change + distance + prev_pitch +
                speed_fb_diff + release_spin_rate +
                I(plate_x^2) + I(plate_z^2),
              data = sliders3, family = binomial)

# Model Evaluation
summary(sl_model_strike)
## 
## Call:
## glm(formula = is_strike ~ pfx_x + pfx_z + plate_x + plate_z + 
##     zone + count_s + pfx_total + break_change + distance + prev_pitch + 
##     speed_fb_diff + release_spin_rate + I(plate_x^2) + I(plate_z^2), 
##     family = binomial, data = sliders3)
## 
## Coefficients:
##                        Estimate  Std. Error z value             Pr(>|z|)    
## (Intercept)         -4.95025380  0.37550196 -13.183 < 0.0000000000000002 ***
## pfx_x                0.23839221  0.10328061   2.308              0.02099 *  
## pfx_z                0.18491137  0.06590690   2.806              0.00502 ** 
## plate_x              0.46386633  0.09033130   5.135   0.0000002818933587 ***
## plate_z              2.45803270  0.22543449  10.904 < 0.0000000000000002 ***
## zone1                1.19268424  0.15378690   7.755   0.0000000000000088 ***
## zone2                0.66944751  0.11694487   5.724   0.0000000103756699 ***
## zone3                0.55258438  0.13153879   4.201   0.0000265828442377 ***
## zone4                0.49157300  0.12118791   4.056   0.0000498588881342 ***
## zone6               -0.04426440  0.09169664  -0.483              0.62929    
## zone7                0.47195408  0.14588479   3.235              0.00122 ** 
## zone8                0.05236311  0.09726019   0.538              0.59031    
## zone9                0.13024189  0.10397632   1.253              0.21035    
## zone11              -0.45938264  0.21207454  -2.166              0.03030 *  
## zone12              -0.44645176  0.14612246  -3.055              0.00225 ** 
## zone13              -0.48725550  0.16628431  -2.930              0.00339 ** 
## zone14              -0.16279605  0.12661978  -1.286              0.19855    
## zone16              -1.80640227  1.03557477  -1.744              0.08110 .  
## zone17              -0.26112200  0.17099826  -1.527              0.12675    
## count_sFull         -0.15783734  0.09108560  -1.733              0.08312 .  
## count_sHitter        0.46310520  0.05118051   9.048 < 0.0000000000000002 ***
## count_sPitcher      -0.09325065  0.04506279  -2.069              0.03851 *  
## pfx_total           -0.51880950  0.11441163  -4.535   0.0000057716281996 ***
## break_change         0.28892243  0.11733979   2.462              0.01381 *  
## distance             0.83850669  0.13257275   6.325   0.0000000002534302 ***
## prev_pitchFastball  -0.14942047  0.03734628  -4.001   0.0000630896086984 ***
## prev_pitchNone       0.59426251  0.05044210  11.781 < 0.0000000000000002 ***
## prev_pitchOff Speed -0.07509416  0.08330675  -0.901              0.36737    
## speed_fb_diff       -0.05304525  0.00805446  -6.586   0.0000000000452374 ***
## release_spin_rate    0.00037446  0.00007346   5.097   0.0000003445692910 ***
## I(plate_x^2)        -0.90294934  0.06193532 -14.579 < 0.0000000000000002 ***
## I(plate_z^2)        -0.56083909  0.04890608 -11.468 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 27338  on 22074  degrees of freedom
## Residual deviance: 24773  on 22043  degrees of freedom
##   (31 observations deleted due to missingness)
## AIC: 24837
## 
## Number of Fisher Scoring iterations: 7
# Model Predictions
sl_preds_strike <- sliders3 %>% 
  mutate(prediction_log = predict(sl_model_strike, sliders3),
         prediction = 1 / (1 + exp(-prediction_log)),
         rounded_pred = case_when(
           prediction >= 0.5 ~ 1,
           prediction < 0.5 ~ 0
         )) %>% 
  filter(!is.na(prediction))

ggplot(sl_preds_strike, aes(x = prediction, y = as.numeric(is_strike))) +
  geom_point(alpha = 0.01) +
  geom_smooth() +
  geom_smooth(method = "lm") +
  geom_abline(slope = 1, intercept = 0, color = "red") +
  theme_bw()

STRIKE Fastball Initial Model

# Initial Model
fb_model_strike_all <- glm(is_strike ~ pitch_speed + pfx_x + pfx_z + plate_x + plate_z + zone +
              count + pfx_total + speed_change + break_change + distance + prev_pitch +
                release_spin_rate + I(plate_x^2) + I(plate_z^2),
              data = fastballs3, family = binomial)

# Model Evaluation
summary(fb_model_strike_all)
## 
## Call:
## glm(formula = is_strike ~ pitch_speed + pfx_x + pfx_z + plate_x + 
##     plate_z + zone + count + pfx_total + speed_change + break_change + 
##     distance + prev_pitch + release_spin_rate + I(plate_x^2) + 
##     I(plate_z^2), family = binomial, data = fastballs3)
## 
## Coefficients:
##                         Estimate   Std. Error z value             Pr(>|z|)    
## (Intercept)          -0.24780405  72.46554118  -0.003             0.997272    
## pitch_speed           0.02129738   0.00528230   4.032  0.00005534201621050 ***
## pfx_x                 0.18963848   0.05675477   3.341             0.000834 ***
## pfx_z                -0.31760698   0.07655709  -4.149  0.00003344721864518 ***
## plate_x               0.37177007   0.04778935   7.779  0.00000000000000729 ***
## plate_z               5.08173312   0.27502030  18.478 < 0.0000000000000002 ***
## zone1                 0.12145573   0.08266136   1.469             0.141747    
## zone2                -0.02041481   0.06564328  -0.311             0.755804    
## zone3                 0.43666709   0.07582591   5.759  0.00000000847085094 ***
## zone4                -0.07727690   0.06987081  -1.106             0.268728    
## zone6                 0.65337265   0.06014767  10.863 < 0.0000000000000002 ***
## zone7                 0.94310736   0.08985345  10.496 < 0.0000000000000002 ***
## zone8                 0.70740731   0.07008825  10.093 < 0.0000000000000002 ***
## zone9                 1.36044590   0.07962648  17.085 < 0.0000000000000002 ***
## zone11                0.05481168   0.10141625   0.540             0.588878    
## zone12               -0.24177778   0.09350028  -2.586             0.009714 ** 
## zone13                0.36667792   0.11207328   3.272             0.001069 ** 
## zone14               -0.14945135   0.09826357  -1.521             0.128279    
## zone16               -0.94110617   0.30475586  -3.088             0.002015 ** 
## zone17               -0.27910981   0.37074139  -0.753             0.451545    
## count0_1            -10.20349183  72.46288909  -0.141             0.888020    
## count0_2            -10.64342382  72.46290162  -0.147             0.883226    
## count1_0             -9.89620629  72.46287376  -0.137             0.891371    
## count1_1            -10.34154275  72.46289181  -0.143             0.886515    
## count1_2            -10.58996475  72.46289249  -0.146             0.883808    
## count2_0            -10.00139783  72.46290371  -0.138             0.890224    
## count2_1            -10.47433734  72.46290143  -0.145             0.885068    
## count2_2            -10.78048270  72.46289400  -0.149             0.881733    
## count3_0             -8.90730217  72.46293470  -0.123             0.902169    
## count3_1            -10.49705741  72.46291931  -0.145             0.884821    
## count3_2            -11.05790860  72.46290199  -0.153             0.878713    
## pfx_total             0.35263722   0.12021641   2.933             0.003353 ** 
## speed_change          0.01966765   0.01396969   1.408             0.159167    
## break_change          0.88355297   0.11146971   7.926  0.00000000000000226 ***
## distance              0.78333765   0.11647205   6.726  0.00000000001749404 ***
## prev_pitchFastball    0.06055842   0.03134423   1.932             0.053354 .  
## prev_pitchNone       -9.52896492  72.46288246  -0.132             0.895379    
## prev_pitchOff Speed   0.06842237   0.06020190   1.137             0.255727    
## release_spin_rate     0.00034884   0.00007543   4.625  0.00000375390785099 ***
## I(plate_x^2)         -1.33742770   0.06047928 -22.114 < 0.0000000000000002 ***
## I(plate_z^2)         -0.95146455   0.05341929 -17.811 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 52121  on 42647  degrees of freedom
## Residual deviance: 43719  on 42607  degrees of freedom
##   (129 observations deleted due to missingness)
## AIC: 43801
## 
## Number of Fisher Scoring iterations: 8

STRIKE Fastball Refined Model & Predictions

# Refined Model

fb_model_strike <- glm(is_strike ~ pitch_speed + pfx_x + pfx_z + plate_x + plate_z + zone +
              count + pfx_total + break_change + distance + 
                release_spin_rate + I(plate_x^2) + I(plate_z^2),
              data = fastballs3, family = binomial)

# Model Evaluation
summary(fb_model_strike)
## 
## Call:
## glm(formula = is_strike ~ pitch_speed + pfx_x + pfx_z + plate_x + 
##     plate_z + zone + count + pfx_total + break_change + distance + 
##     release_spin_rate + I(plate_x^2) + I(plate_z^2), family = binomial, 
##     data = fastballs3)
## 
## Coefficients:
##                       Estimate   Std. Error z value             Pr(>|z|)    
## (Intercept)       -10.01959874   0.59247854 -16.911 < 0.0000000000000002 ***
## pitch_speed         0.02377242   0.00499255   4.762 0.000001920851941636 ***
## pfx_x               0.18436160   0.05667941   3.253             0.001143 ** 
## pfx_z              -0.31509822   0.07641856  -4.123 0.000037344943541671 ***
## plate_x             0.37389991   0.04774599   7.831 0.000000000000004839 ***
## plate_z             5.09165838   0.27486149  18.524 < 0.0000000000000002 ***
## zone1               0.12371525   0.08258168   1.498             0.134108    
## zone2              -0.02048224   0.06558575  -0.312             0.754815    
## zone3               0.43733275   0.07575427   5.773 0.000000007785190804 ***
## zone4              -0.07663348   0.06980848  -1.098             0.272306    
## zone6               0.65471683   0.06008547  10.896 < 0.0000000000000002 ***
## zone7               0.94474809   0.08976595  10.525 < 0.0000000000000002 ***
## zone8               0.70972523   0.07005044  10.132 < 0.0000000000000002 ***
## zone9               1.36380395   0.07958839  17.136 < 0.0000000000000002 ***
## zone11              0.05476433   0.10129449   0.541             0.588752    
## zone12             -0.24069515   0.09339496  -2.577             0.009961 ** 
## zone13              0.36887777   0.11199130   3.294             0.000988 ***
## zone14             -0.14452084   0.09818047  -1.472             0.141023    
## zone16             -0.94065180   0.30465849  -3.088             0.002018 ** 
## zone17             -0.27644903   0.37068180  -0.746             0.455797    
## count0_1           -0.63099114   0.03972981 -15.882 < 0.0000000000000002 ***
## count0_2           -1.05471515   0.06096876 -17.299 < 0.0000000000000002 ***
## count1_0           -0.32445905   0.04035505  -8.040 0.000000000000000898 ***
## count1_1           -0.77088056   0.04466859 -17.258 < 0.0000000000000002 ***
## count1_2           -1.01267040   0.04930077 -20.541 < 0.0000000000000002 ***
## count2_0           -0.43199652   0.05982086  -7.222 0.000000000000514163 ***
## count2_1           -0.89747292   0.05796989 -15.482 < 0.0000000000000002 ***
## count2_2           -1.20077475   0.05090482 -23.589 < 0.0000000000000002 ***
## count3_0            0.66130927   0.08854101   7.469 0.000000000000080831 ***
## count3_1           -0.91844068   0.07698930 -11.929 < 0.0000000000000002 ***
## count3_2           -1.47085921   0.06132638 -23.984 < 0.0000000000000002 ***
## pfx_total           0.34150808   0.12009437   2.844             0.004460 ** 
## break_change        0.90602065   0.11095569   8.166 0.000000000000000320 ***
## distance            0.78405900   0.11636185   6.738 0.000000000016045948 ***
## release_spin_rate   0.00035075   0.00007507   4.673 0.000002974568745848 ***
## I(plate_x^2)       -1.33736701   0.06042433 -22.133 < 0.0000000000000002 ***
## I(plate_z^2)       -0.95346597   0.05338985 -17.859 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 52181  on 42689  degrees of freedom
## Residual deviance: 43777  on 42653  degrees of freedom
##   (87 observations deleted due to missingness)
## AIC: 43851
## 
## Number of Fisher Scoring iterations: 7
# Model Predictions
fb_preds_strike <- fastballs3 %>% 
  mutate(prediction_log = predict(fb_model_strike, fastballs3),
         prediction = 1 / (1 + exp(-prediction_log)),
         rounded_pred = case_when(
           prediction >= 0.5 ~ 1,
           prediction < 0.5 ~ 0
         )) %>% 
  filter(!is.na(prediction))

ggplot(fb_preds_strike, aes(x = prediction, y = as.numeric(is_strike))) +
  geom_point(alpha = 0.01) +
  geom_smooth() +
  geom_smooth(method = "lm") +
  geom_abline(slope = 1, intercept = 0, color = "red") +
  theme_bw()

Slider Initial Model

# Initial Model
sl_model_all <- glm(whiff ~ pitch_speed + pfx_x + pfx_z + plate_x + plate_z + zone +
              count + pfx_total + speed_change + break_change + distance + 
                prev_pitch + speed_fb_diff + pfx_x_fb_diff + pfx_z_fb_diff + release_spin_rate +
                I(plate_x^2) + I(plate_z^2),
              data = sliders3, family = binomial)

# Model Evaluation
# summary(sl_model_all)

Slider Refined Model & Predictions

sl_model <- glm(whiff ~ pitch_speed + pfx_x + pfx_z + plate_x + plate_z + zone +
              pfx_total + speed_change + break_change + distance + release_spin_rate +
                prev_pitch + speed_fb_diff + I(plate_x^2) + I(plate_z^2),
              data = sliders3, family = binomial)
sl_model %>% summary
## 
## Call:
## glm(formula = whiff ~ pitch_speed + pfx_x + pfx_z + plate_x + 
##     plate_z + zone + pfx_total + speed_change + break_change + 
##     distance + release_spin_rate + prev_pitch + speed_fb_diff + 
##     I(plate_x^2) + I(plate_z^2), family = binomial, data = sliders3)
## 
## Coefficients:
##                        Estimate  Std. Error z value             Pr(>|z|)    
## (Intercept)         -8.21621327  0.92873085  -8.847 < 0.0000000000000002 ***
## pitch_speed          0.01379833  0.00950387   1.452             0.146539    
## pfx_x                0.25569580  0.13139319   1.946             0.051650 .  
## pfx_z                0.42842356  0.08683887   4.934      0.0000008074996 ***
## plate_x              0.74531223  0.12539349   5.944      0.0000000027851 ***
## plate_z              3.09714611  0.28408297  10.902 < 0.0000000000000002 ***
## zone1                0.38978765  0.25859110   1.507             0.131721    
## zone2                0.44246683  0.18335350   2.413             0.015814 *  
## zone3               -0.19221608  0.20819683  -0.923             0.355881    
## zone4               -0.70481946  0.26992612  -2.611             0.009024 ** 
## zone6               -0.27504771  0.14374089  -1.913             0.055684 .  
## zone7               -0.16599387  0.24143054  -0.688             0.491741    
## zone8                0.13680416  0.14367804   0.952             0.341017    
## zone9                0.18917878  0.14742171   1.283             0.199405    
## zone11               0.73602843  0.30001116   2.453             0.014154 *  
## zone12              -0.49401425  0.20861735  -2.368             0.017883 *  
## zone13               0.28805261  0.21217246   1.358             0.174580    
## zone14               0.35478445  0.16987034   2.089             0.036747 *  
## zone16              -0.50459713  1.04806302  -0.481             0.630192    
## zone17              -0.43662410  0.21459306  -2.035             0.041885 *  
## pfx_total           -0.63699864  0.14815625  -4.300      0.0000171179534 ***
## speed_change         0.06769027  0.02117828   3.196             0.001392 ** 
## break_change         0.49887128  0.15124336   3.298             0.000972 ***
## distance             1.40199266  0.16784357   8.353 < 0.0000000000000002 ***
## release_spin_rate    0.00033421  0.00009603   3.480             0.000501 ***
## prev_pitchFastball  -0.17559463  0.04455425  -3.941      0.0000810946499 ***
## prev_pitchNone      -0.35137282  0.05301926  -6.627      0.0000000000342 ***
## prev_pitchOff Speed -0.07831449  0.09720548  -0.806             0.420439    
## speed_fb_diff       -0.06485859  0.01173577  -5.527      0.0000000326550 ***
## I(plate_x^2)        -1.00618680  0.07828916 -12.852 < 0.0000000000000002 ***
## I(plate_z^2)        -0.72377372  0.06392984 -11.321 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 19130  on 22074  degrees of freedom
## Residual deviance: 17494  on 22044  degrees of freedom
##   (31 observations deleted due to missingness)
## AIC: 17556
## 
## Number of Fisher Scoring iterations: 7
# Model Predictions
sl_preds <- sliders3 %>% 
  mutate(prediction_log = predict(sl_model, sliders3),
         prediction = 1 / (1 + exp(-prediction_log)),
         rounded_pred = case_when(
           prediction >= 0.5 ~ 1,
           prediction < 0.5 ~ 0
         )) %>% 
  filter(!is.na(prediction))

sl_preds %>% 
  mutate(prediction = round(prediction, 2)) %>% 
  group_by(prediction) %>% 
  summarize(n = n(), whiff_rate = mean(whiff)) %>% 
  as.data.frame() %>% 
  ggplot(aes(x = prediction, y = whiff_rate)) +
  geom_point(aes(size = n)) + # size of bin shown on graph
  coord_fixed() +
  geom_smooth(se = FALSE) +
  labs(y = "observed whiff proportion",
       x = "projected % whiff chance",
       title = "Slider Whiff Model Prediction",
       subtitle = "Whiff proportion by predicted whiff value",
       caption = "Whiff predictions have a 1% bin width")

Fastball Initial Model

# Initial Model
fb_model_all <- glm(whiff ~ pitch_speed + pfx_x + pfx_z + plate_x + plate_z + zone +
              count + pfx_total + speed_change + break_change + distance + release_spin_rate +
                prev_pitch + pitch_type + I(plate_x^2) + I(plate_z^2),
              data = fastballs3, family = binomial)

# Model Evaluation
# summary(fb_model_all)

Fastball Refined Model & Predictions

fb_model <- glm(whiff ~ pitch_speed + pfx_x + pfx_z + plate_x + plate_z + zone +
              pfx_total + speed_change + break_change + release_spin_rate +
                pitch_type + I(plate_x^2) + I(plate_z^2),
              data = fastballs3, family = binomial)

fb_model %>% summary
## 
## Call:
## glm(formula = whiff ~ pitch_speed + pfx_x + pfx_z + plate_x + 
##     plate_z + zone + pfx_total + speed_change + break_change + 
##     release_spin_rate + pitch_type + I(plate_x^2) + I(plate_z^2), 
##     family = binomial, data = fastballs3)
## 
## Coefficients:
##                      Estimate  Std. Error z value             Pr(>|z|)    
## (Intercept)       -13.0432092   0.8231173 -15.846 < 0.0000000000000002 ***
## pitch_speed         0.0672112   0.0080667   8.332 < 0.0000000000000002 ***
## pfx_x               0.6257666   0.0859477   7.281 0.000000000000331884 ***
## pfx_z              -0.7427616   0.1416365  -5.244 0.000000157013988062 ***
## plate_x            -0.0568960   0.0632614  -0.899             0.368450    
## plate_z             2.5341029   0.2336575  10.845 < 0.0000000000000002 ***
## zone1               0.1398252   0.0979558   1.427             0.153456    
## zone2               0.1480009   0.0807817   1.832             0.066935 .  
## zone3               0.3149557   0.0915465   3.440             0.000581 ***
## zone4              -0.3834776   0.1103761  -3.474             0.000512 ***
## zone6               0.2138205   0.0848359   2.520             0.011722 *  
## zone7               0.3424103   0.1430845   2.393             0.016708 *  
## zone8              -0.0670987   0.1210132  -0.554             0.579254    
## zone9              -0.0487777   0.1264578  -0.386             0.699702    
## zone11              0.2660080   0.1109173   2.398             0.016473 *  
## zone12              0.0303559   0.1081706   0.281             0.778995    
## zone13              1.1223798   0.1396766   8.036 0.000000000000000932 ***
## zone14              0.4616576   0.1337223   3.452             0.000556 ***
## zone16             -1.9920773   0.3330034  -5.982 0.000000002202072494 ***
## zone17              1.2271317   0.4080727   3.007             0.002637 ** 
## pfx_total           0.6062551   0.1898086   3.194             0.001403 ** 
## speed_change        0.1275708   0.0204957   6.224 0.000000000483784410 ***
## break_change        1.3238868   0.1695934   7.806 0.000000000000005892 ***
## release_spin_rate   0.0005245   0.0001209   4.337 0.000014437748864811 ***
## pitch_typeFF       -0.4616070   0.1131537  -4.079 0.000045138488337887 ***
## pitch_typeSI       -0.7771742   0.1311966  -5.924 0.000000003147059216 ***
## I(plate_x^2)       -0.6472930   0.0514652 -12.577 < 0.0000000000000002 ***
## I(plate_z^2)       -0.3369572   0.0394618  -8.539 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 25315  on 42647  degrees of freedom
## Residual deviance: 23307  on 42620  degrees of freedom
##   (129 observations deleted due to missingness)
## AIC: 23363
## 
## Number of Fisher Scoring iterations: 7
# Model Predictions
fb_preds <- fastballs3 %>% 
  mutate(prediction_log = predict(fb_model, fastballs3),
         prediction = 1 / (1 + exp(-prediction_log)),
         rounded_pred = case_when(
           prediction >= 0.5 ~ 1,
           prediction < 0.5 ~ 0
         )) %>% 
  filter(!is.na(prediction))

fb_preds %>% 
  mutate(prediction = round(prediction, 2)) %>% 
  group_by(prediction) %>% 
  summarize(n = n(), whiff_rate = mean(whiff)) %>% 
  as.data.frame() %>% 
  ggplot(aes(x = prediction, y = whiff_rate)) +
  geom_point(aes(size = n)) + # size of bin shown on graph
  coord_fixed() +
  geom_smooth(se = FALSE) +
  labs(y = "observed whiff proportion",
       x = "projected % whiff chance",
       title = "Fastball Whiff Model Prediction",
       subtitle = "Whiff proportion by predicted whiff value",
       caption = "Whiff predictions have a 1% bin width")


Making Naive Models

## Intercept-Only Models

# Slider

sl_model_whiff_int <- glm(whiff ~ 1, 
                      data = sliders, family = binomial) 
sl_whiff_int <- augment(sl_model_whiff_int)


sl_whiff_int <- sl_whiff_int %>% 
  mutate(prediction_log = predict(sl_model_whiff_int, sl_whiff_int),
         prediction = 1 / (1 + exp(-prediction_log)),
         rounded_pred = case_when(
           prediction >= 0.5 ~ 1,
           prediction < 0.5 ~ 0
         )) %>% 
  filter(!is.na(prediction)) %>% 
  mutate(Response = "whiff",
         pitch = "slider",
         type = "intercept only")


# Fastball

fb_model_whiff_int <- glm(whiff ~ 1, 
                      data = fastballs, family = binomial) 
fb_whiff_int <- augment(fb_model_whiff_int)


fb_whiff_int <- fb_whiff_int %>% 
  mutate(prediction_log = predict(fb_model_whiff_int, fb_whiff_int),
         prediction = 1 / (1 + exp(-prediction_log)),
         rounded_pred = case_when(
           prediction >= 0.5 ~ 1,
           prediction < 0.5 ~ 0
         )) %>% 
  filter(!is.na(prediction)) %>% 
  mutate(Response = "whiff",
         pitch = "fastball",
         type = "intercept only")


## Zone-Only Models

# Slider

sl_model_whiff_zone <- glm(whiff ~ zone, 
                      data = sliders3, family = binomial) 

sl_whiff_zone <- augment(sl_model_whiff_zone)


sl_whiff_zone <- sl_whiff_zone %>% 
  mutate(prediction_log = predict(sl_model_whiff_zone, sl_whiff_zone),
         prediction = 1 / (1 + exp(-prediction_log)),
         rounded_pred = case_when(
           prediction >= 0.5 ~ 1,
           prediction < 0.5 ~ 0
         )) %>% 
  filter(!is.na(prediction)) %>% 
  mutate(Response = "whiff",
         pitch = "slider",
         type = "zone only")


# Fastball

fb_model_whiff_zone <- glm(whiff ~ zone, 
                      data = fastballs3, family = binomial) 

fb_whiff_zone <- augment(fb_model_whiff_zone)


fb_whiff_zone <- fb_whiff_zone %>% 
  mutate(prediction_log = predict(fb_model_whiff_zone, fb_whiff_zone),
         prediction = 1 / (1 + exp(-prediction_log)),
         rounded_pred = case_when(
           prediction >= 0.5 ~ 1,
           prediction < 0.5 ~ 0
         )) %>% 
  filter(!is.na(prediction)) %>% 
  mutate(Response = "whiff",
         pitch = "fastball",
         type = "zone only")


Combining All Models

WHIFF Slider Refined Model & Predictions

# Refined Model
sl_model_whiff_final <- glm(whiff ~ pitch_speed + pfx_x + pfx_z + plate_x + plate_z + zone +
              pfx_total + speed_change + break_change + prev_pitch +
                speed_fb_diff + release_spin_rate +
                I(plate_z^2) + pred_bwhiff,
              data = sliders3, family = binomial)

# Model Evaluation
summary(sl_model_whiff_final)
## 
## Call:
## glm(formula = whiff ~ pitch_speed + pfx_x + pfx_z + plate_x + 
##     plate_z + zone + pfx_total + speed_change + break_change + 
##     prev_pitch + speed_fb_diff + release_spin_rate + I(plate_z^2) + 
##     pred_bwhiff, family = binomial, data = sliders3)
## 
## Coefficients:
##                        Estimate  Std. Error z value             Pr(>|z|)    
## (Intercept)         -8.80878344  0.89007871  -9.897 < 0.0000000000000002 ***
## pitch_speed          0.02326646  0.00959716   2.424             0.015338 *  
## pfx_x                0.31131325  0.13220977   2.355             0.018538 *  
## pfx_z                0.45948159  0.08749418   5.252    0.000000150810218 ***
## plate_x             -0.23247981  0.05998751  -3.875             0.000106 ***
## plate_z              0.95888391  0.15172625   6.320    0.000000000261854 ***
## zone1                0.37881051  0.23170680   1.635             0.102076    
## zone2                0.95725352  0.17340920   5.520    0.000000033861350 ***
## zone3                0.75641326  0.19305927   3.918    0.000089273216336 ***
## zone4               -1.02818860  0.25655721  -4.008    0.000061328858026 ***
## zone6                0.43121058  0.13343144   3.232             0.001231 ** 
## zone7               -0.31786948  0.22186265  -1.433             0.151935    
## zone8                0.53994511  0.13473803   4.007    0.000061398738835 ***
## zone9                1.05378145  0.12686375   8.306 < 0.0000000000000002 ***
## zone11               0.54034473  0.22823323   2.368             0.017908 *  
## zone12               0.69425148  0.18075310   3.841             0.000123 ***
## zone13               0.40701650  0.17620733   2.310             0.020895 *  
## zone14               1.38196829  0.13705675  10.083 < 0.0000000000000002 ***
## zone16              -0.64943786  1.02834402  -0.632             0.527689    
## zone17               0.09380503  0.19185864   0.489             0.624893    
## pfx_total           -0.65653639  0.14906745  -4.404    0.000010613050527 ***
## speed_change         0.05394588  0.02136534   2.525             0.011572 *  
## break_change         0.47182090  0.15239170   3.096             0.001961 ** 
## prev_pitchFastball  -0.14097266  0.04493446  -3.137             0.001705 ** 
## prev_pitchNone      -0.34475032  0.05348009  -6.446    0.000000000114591 ***
## prev_pitchOff Speed -0.00889866  0.09788078  -0.091             0.927561    
## speed_fb_diff       -0.06462824  0.01186354  -5.448    0.000000051043876 ***
## release_spin_rate    0.00037183  0.00009683   3.840             0.000123 ***
## I(plate_z^2)        -0.32429647  0.04362696  -7.433    0.000000000000106 ***
## pred_bwhiff         18.52544050  0.93034175  19.913 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 19130  on 22074  degrees of freedom
## Residual deviance: 17290  on 22045  degrees of freedom
##   (31 observations deleted due to missingness)
## AIC: 17350
## 
## Number of Fisher Scoring iterations: 7
sl_whiff <- augment(sl_model_whiff_final)
  
sl_whiff <- sl_whiff %>% 
  mutate(prediction_log = predict(sl_model_whiff_final, sl_whiff),
         prediction = 1 / (1 + exp(-prediction_log)),
         rounded_pred = case_when(
           prediction >= 0.5 ~ 1,
           prediction < 0.5 ~ 0
         )) %>% 
  filter(!is.na(prediction)) %>% 
  mutate(Response = "whiff",
         pitch = "slider")

WHIFF Fastball Refined Model & Predictions

# Refined Model
fb_model_whiff_final <- glm(whiff ~ pitch_speed + pfx_x + pfx_z + plate_x + plate_z + zone +
              pfx_total + speed_change + break_change + prev_pitch +
                release_spin_rate + I(plate_x^2) + I(plate_z^2) + pred_bwhiff,
              data = fastballs3, family = binomial)

# Model Evaluation
summary(fb_model_whiff_final)
## 
## Call:
## glm(formula = whiff ~ pitch_speed + pfx_x + pfx_z + plate_x + 
##     plate_z + zone + pfx_total + speed_change + break_change + 
##     prev_pitch + release_spin_rate + I(plate_x^2) + I(plate_z^2) + 
##     pred_bwhiff, family = binomial, data = fastballs3)
## 
## Coefficients:
##                        Estimate  Std. Error z value             Pr(>|z|)    
## (Intercept)         -13.5919383   0.8088909 -16.803 < 0.0000000000000002 ***
## pitch_speed           0.0552552   0.0078549   7.034  0.00000000000199988 ***
## pfx_x                 0.6349090   0.0806151   7.876  0.00000000000000339 ***
## pfx_z                -1.1544701   0.1172599  -9.845 < 0.0000000000000002 ***
## plate_x              -0.0572908   0.0635927  -0.901             0.367640    
## plate_z               2.5729441   0.2358447  10.909 < 0.0000000000000002 ***
## zone1                 0.1997184   0.0992443   2.012             0.044179 *  
## zone2                 0.2019836   0.0818785   2.467             0.013630 *  
## zone3                 0.3983497   0.0926473   4.300  0.00001710771423435 ***
## zone4                -0.4026278   0.1113651  -3.615             0.000300 ***
## zone6                 0.2272683   0.0857170   2.651             0.008016 ** 
## zone7                 0.2286930   0.1445253   1.582             0.113564    
## zone8                -0.1368806   0.1218096  -1.124             0.261129    
## zone9                -0.1079255   0.1272134  -0.848             0.396226    
## zone11                0.4043838   0.1124988   3.595             0.000325 ***
## zone12                0.1648698   0.1096437   1.504             0.132662    
## zone13                1.0843782   0.1409670   7.692  0.00000000000001444 ***
## zone14                0.4538287   0.1346351   3.371             0.000749 ***
## zone16               -1.7173247   0.3354893  -5.119  0.00000030737820315 ***
## zone17                1.2437602   0.4090633   3.041             0.002362 ** 
## pfx_total             0.8267996   0.1821178   4.540  0.00000562765769026 ***
## speed_change          0.0995532   0.0209438   4.753  0.00000200062697723 ***
## break_change          1.5060658   0.1686072   8.932 < 0.0000000000000002 ***
## prev_pitchFastball    0.0326755   0.0433306   0.754             0.450791    
## prev_pitchNone       -0.3730195   0.0522608  -7.138  0.00000000000094935 ***
## prev_pitchOff Speed   0.0282182   0.0841411   0.335             0.737348    
## release_spin_rate     0.0005119   0.0001151   4.446  0.00000875563799586 ***
## I(plate_x^2)         -0.6924502   0.0522011 -13.265 < 0.0000000000000002 ***
## I(plate_z^2)         -0.3657328   0.0399723  -9.150 < 0.0000000000000002 ***
## pred_bwhiff          17.4420990   0.6854232  25.447 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 25315  on 42647  degrees of freedom
## Residual deviance: 22625  on 42618  degrees of freedom
##   (129 observations deleted due to missingness)
## AIC: 22685
## 
## Number of Fisher Scoring iterations: 7
fb_whiff <- augment(fb_model_whiff_final)
  
fb_whiff <- fb_whiff %>% 
  mutate(prediction_log = predict(fb_model_whiff_final, fb_whiff),
         prediction = 1 / (1 + exp(-prediction_log)),
         rounded_pred = case_when(
           prediction >= 0.5 ~ 1,
           prediction < 0.5 ~ 0
         )) %>% 
  filter(!is.na(prediction)) %>% 
  mutate(Response = "whiff",
         pitch = "fastball")

BARREL Slider Refined Model & Predictions

# Refined Model
sl_model_barrel_final <- glm(is_barrel ~ pitch_speed + plate_z + plate_x +
              prev_pitch + speed_fb_diff + I(plate_x^2) + I(plate_z^2) + pred_bbarrel,
              data = sliders3, family = binomial)


# Model Evaluation
summary(sl_model_barrel_final)
## 
## Call:
## glm(formula = is_barrel ~ pitch_speed + plate_z + plate_x + prev_pitch + 
##     speed_fb_diff + I(plate_x^2) + I(plate_z^2) + pred_bbarrel, 
##     family = binomial, data = sliders3)
## 
## Coefficients:
##                      Estimate Std. Error z value             Pr(>|z|)    
## (Intercept)          -9.18572    2.98319  -3.079              0.00208 ** 
## pitch_speed          -0.08777    0.02815  -3.118              0.00182 ** 
## plate_z              10.15457    1.23634   8.213 < 0.0000000000000002 ***
## plate_x              -0.74899    0.17537  -4.271   0.0000194594585839 ***
## prev_pitchFastball    0.11413    0.15020   0.760              0.44735    
## prev_pitchNone       -0.43378    0.17970  -2.414              0.01578 *  
## prev_pitchOff Speed   0.14818    0.31193   0.475              0.63475    
## speed_fb_diff         0.08635    0.03560   2.426              0.01528 *  
## I(plate_x^2)         -2.32959    0.30838  -7.554   0.0000000000000421 ***
## I(plate_z^2)         -2.07409    0.26002  -7.977   0.0000000000000015 ***
## pred_bbarrel        159.51045   10.94557  14.573 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2844.6  on 22092  degrees of freedom
## Residual deviance: 2151.8  on 22082  degrees of freedom
##   (13 observations deleted due to missingness)
## AIC: 2173.8
## 
## Number of Fisher Scoring iterations: 10
sl_barrel <- augment(sl_model_barrel_final)
  
sl_barrel <- sl_barrel %>% 
  mutate(prediction_log = predict(sl_model_barrel_final, sl_barrel),
         prediction = 1 / (1 + exp(-prediction_log)),
         rounded_pred = case_when(
           prediction >= 0.5 ~ 1,
           prediction < 0.5 ~ 0
         )) %>% 
  filter(!is.na(prediction)) %>% 
  mutate(Response = "barrel",
         pitch = "slider")

BARREL Fastball Refined Model & Predictions

# Refined Model
fb_model_barrel_final <- glm(is_barrel ~ pitch_speed + pfx_z + plate_z + 
              break_change + prev_pitch + plate_x +
                release_spin_rate + I(plate_x^2) + I(plate_z^2) + pred_bbarrel,
              data = fastballs3, family = binomial)


# Model Evaluation
summary(fb_model_barrel_final)
## 
## Call:
## glm(formula = is_barrel ~ pitch_speed + pfx_z + plate_z + break_change + 
##     prev_pitch + plate_x + release_spin_rate + I(plate_x^2) + 
##     I(plate_z^2) + pred_bbarrel, family = binomial, data = fastballs3)
## 
## Coefficients:
##                        Estimate  Std. Error z value             Pr(>|z|)    
## (Intercept)          -4.3069933   1.6651535  -2.587              0.00969 ** 
## pitch_speed          -0.0948534   0.0157780  -6.012        0.00000000184 ***
## pfx_z                 0.2534667   0.1145416   2.213              0.02691 *  
## plate_z               7.3962106   0.6199938  11.929 < 0.0000000000000002 ***
## break_change         -1.0515577   0.3287176  -3.199              0.00138 ** 
## prev_pitchFastball    0.0390869   0.0941106   0.415              0.67790    
## prev_pitchNone       -0.5136715   0.1123902  -4.570        0.00000486723 ***
## prev_pitchOff Speed  -0.0681368   0.1804846  -0.378              0.70579    
## plate_x              -0.4714559   0.0878082  -5.369        0.00000007911 ***
## release_spin_rate    -0.0007390   0.0002309  -3.200              0.00138 ** 
## I(plate_x^2)         -1.8263205   0.1411662 -12.937 < 0.0000000000000002 ***
## I(plate_z^2)         -1.4156005   0.1153409 -12.273 < 0.0000000000000002 ***
## pred_bbarrel        107.5975098   5.7236975  18.799 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 7339.0  on 42689  degrees of freedom
## Residual deviance: 6260.6  on 42677  degrees of freedom
##   (87 observations deleted due to missingness)
## AIC: 6286.6
## 
## Number of Fisher Scoring iterations: 9
fb_barrel <- augment(fb_model_barrel_final)
  
fb_barrel <- fb_barrel %>% 
  mutate(prediction_log = predict(fb_model_barrel_final, fb_barrel),
         prediction = 1 / (1 + exp(-prediction_log)),
         rounded_pred = case_when(
           prediction >= 0.5 ~ 1,
           prediction < 0.5 ~ 0
         )) %>% 
  filter(!is.na(prediction)) %>% 
  mutate(Response = "barrel",
         pitch = "fastball")

# No Zone in Fastball?

summarize(fastballs3, b = mean(is_barrel), d = mean(distance), .by = zone) %>% 
  arrange(desc(b))
## # A tibble: 15 × 3
##    zone        b     d
##    <fct>   <dbl> <dbl>
##  1 5     0.0418  0.234
##  2 4     0.0401  0.557
##  3 2     0.0321  0.668
##  4 6     0.0266  0.571
##  5 7     0.0260  0.843
##  6 8     0.0205  0.648
##  7 3     0.0183  0.835
##  8 1     0.0176  0.856
##  9 9     0.00881 0.861
## 10 13    0.00777 1.31 
## 11 12    0.00744 1.34 
## 12 11    0.00578 1.40 
## 13 14    0       1.37 
## 14 17    0       2.30 
## 15 16    0       2.36
summarize(fastballs3, b = mean(is_barrel), d = mean(distance), .by = zone) %>% 
  arrange(desc(b)) %>% 
  with(cor(d, b))
## [1] -0.8422974

STRIKE Slider Refined Model & Predictions

# Refined Model

sliders3 <- sliders3 %>% 
  mutate(fb_prev = ifelse(prev_pitch == "Fastball", 1, 0)) %>% 
  mutate(count_s = case_when(count  %in% c("0_0", "1_1", "2_2") ~ "Even",
                             count  %in%  c("1_0", "2_1", "2_0", "3_0", "3_1") ~ "Hitter",
                             count %in%  c("0_1", "0_2", "1_2") ~ "Pitcher",
                             count == "3_2" ~ "Full"))

sl_model_strike_final <- glm(is_strike ~ pfx_x + pfx_z + plate_x + plate_z + zone +
              count_s + pfx_total + break_change + distance + prev_pitch +
                speed_fb_diff + release_spin_rate +
                I(plate_x^2) + I(plate_z^2),
              data = sliders3, family = binomial)

# Model Evaluation
summary(sl_model_strike_final)
## 
## Call:
## glm(formula = is_strike ~ pfx_x + pfx_z + plate_x + plate_z + 
##     zone + count_s + pfx_total + break_change + distance + prev_pitch + 
##     speed_fb_diff + release_spin_rate + I(plate_x^2) + I(plate_z^2), 
##     family = binomial, data = sliders3)
## 
## Coefficients:
##                        Estimate  Std. Error z value             Pr(>|z|)    
## (Intercept)         -4.95025380  0.37550196 -13.183 < 0.0000000000000002 ***
## pfx_x                0.23839221  0.10328061   2.308              0.02099 *  
## pfx_z                0.18491137  0.06590690   2.806              0.00502 ** 
## plate_x              0.46386633  0.09033130   5.135   0.0000002818933587 ***
## plate_z              2.45803270  0.22543449  10.904 < 0.0000000000000002 ***
## zone1                1.19268424  0.15378690   7.755   0.0000000000000088 ***
## zone2                0.66944751  0.11694487   5.724   0.0000000103756699 ***
## zone3                0.55258438  0.13153879   4.201   0.0000265828442377 ***
## zone4                0.49157300  0.12118791   4.056   0.0000498588881342 ***
## zone6               -0.04426440  0.09169664  -0.483              0.62929    
## zone7                0.47195408  0.14588479   3.235              0.00122 ** 
## zone8                0.05236311  0.09726019   0.538              0.59031    
## zone9                0.13024189  0.10397632   1.253              0.21035    
## zone11              -0.45938264  0.21207454  -2.166              0.03030 *  
## zone12              -0.44645176  0.14612246  -3.055              0.00225 ** 
## zone13              -0.48725550  0.16628431  -2.930              0.00339 ** 
## zone14              -0.16279605  0.12661978  -1.286              0.19855    
## zone16              -1.80640227  1.03557477  -1.744              0.08110 .  
## zone17              -0.26112200  0.17099826  -1.527              0.12675    
## count_sFull         -0.15783734  0.09108560  -1.733              0.08312 .  
## count_sHitter        0.46310520  0.05118051   9.048 < 0.0000000000000002 ***
## count_sPitcher      -0.09325065  0.04506279  -2.069              0.03851 *  
## pfx_total           -0.51880950  0.11441163  -4.535   0.0000057716281996 ***
## break_change         0.28892243  0.11733979   2.462              0.01381 *  
## distance             0.83850669  0.13257275   6.325   0.0000000002534302 ***
## prev_pitchFastball  -0.14942047  0.03734628  -4.001   0.0000630896086984 ***
## prev_pitchNone       0.59426251  0.05044210  11.781 < 0.0000000000000002 ***
## prev_pitchOff Speed -0.07509416  0.08330675  -0.901              0.36737    
## speed_fb_diff       -0.05304525  0.00805446  -6.586   0.0000000000452374 ***
## release_spin_rate    0.00037446  0.00007346   5.097   0.0000003445692910 ***
## I(plate_x^2)        -0.90294934  0.06193532 -14.579 < 0.0000000000000002 ***
## I(plate_z^2)        -0.56083909  0.04890608 -11.468 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 27338  on 22074  degrees of freedom
## Residual deviance: 24773  on 22043  degrees of freedom
##   (31 observations deleted due to missingness)
## AIC: 24837
## 
## Number of Fisher Scoring iterations: 7
sl_strike <- augment(sl_model_strike_final)
  
sl_strike <- sl_strike %>% 
  mutate(prediction_log = predict(sl_model_strike_final, sl_strike),
         prediction = 1 / (1 + exp(-prediction_log)),
         rounded_pred = case_when(
           prediction >= 0.5 ~ 1,
           prediction < 0.5 ~ 0
         )) %>% 
  filter(!is.na(prediction)) %>% 
  mutate(Response = "strike",
         pitch = "slider")

STRIKE Fastball Refined Model & Predictions

# Refined Model

fb_model_strike_final <- glm(is_strike ~ pitch_speed + pfx_x + pfx_z + plate_x + plate_z + zone +
              count + pfx_total + break_change + distance + 
                release_spin_rate + I(plate_x^2) + I(plate_z^2),
              data = fastballs3, family = binomial)


# Model Evaluation
summary(fb_model_strike_final)
## 
## Call:
## glm(formula = is_strike ~ pitch_speed + pfx_x + pfx_z + plate_x + 
##     plate_z + zone + count + pfx_total + break_change + distance + 
##     release_spin_rate + I(plate_x^2) + I(plate_z^2), family = binomial, 
##     data = fastballs3)
## 
## Coefficients:
##                       Estimate   Std. Error z value             Pr(>|z|)    
## (Intercept)       -10.01959874   0.59247854 -16.911 < 0.0000000000000002 ***
## pitch_speed         0.02377242   0.00499255   4.762 0.000001920851941636 ***
## pfx_x               0.18436160   0.05667941   3.253             0.001143 ** 
## pfx_z              -0.31509822   0.07641856  -4.123 0.000037344943541671 ***
## plate_x             0.37389991   0.04774599   7.831 0.000000000000004839 ***
## plate_z             5.09165838   0.27486149  18.524 < 0.0000000000000002 ***
## zone1               0.12371525   0.08258168   1.498             0.134108    
## zone2              -0.02048224   0.06558575  -0.312             0.754815    
## zone3               0.43733275   0.07575427   5.773 0.000000007785190804 ***
## zone4              -0.07663348   0.06980848  -1.098             0.272306    
## zone6               0.65471683   0.06008547  10.896 < 0.0000000000000002 ***
## zone7               0.94474809   0.08976595  10.525 < 0.0000000000000002 ***
## zone8               0.70972523   0.07005044  10.132 < 0.0000000000000002 ***
## zone9               1.36380395   0.07958839  17.136 < 0.0000000000000002 ***
## zone11              0.05476433   0.10129449   0.541             0.588752    
## zone12             -0.24069515   0.09339496  -2.577             0.009961 ** 
## zone13              0.36887777   0.11199130   3.294             0.000988 ***
## zone14             -0.14452084   0.09818047  -1.472             0.141023    
## zone16             -0.94065180   0.30465849  -3.088             0.002018 ** 
## zone17             -0.27644903   0.37068180  -0.746             0.455797    
## count0_1           -0.63099114   0.03972981 -15.882 < 0.0000000000000002 ***
## count0_2           -1.05471515   0.06096876 -17.299 < 0.0000000000000002 ***
## count1_0           -0.32445905   0.04035505  -8.040 0.000000000000000898 ***
## count1_1           -0.77088056   0.04466859 -17.258 < 0.0000000000000002 ***
## count1_2           -1.01267040   0.04930077 -20.541 < 0.0000000000000002 ***
## count2_0           -0.43199652   0.05982086  -7.222 0.000000000000514163 ***
## count2_1           -0.89747292   0.05796989 -15.482 < 0.0000000000000002 ***
## count2_2           -1.20077475   0.05090482 -23.589 < 0.0000000000000002 ***
## count3_0            0.66130927   0.08854101   7.469 0.000000000000080831 ***
## count3_1           -0.91844068   0.07698930 -11.929 < 0.0000000000000002 ***
## count3_2           -1.47085921   0.06132638 -23.984 < 0.0000000000000002 ***
## pfx_total           0.34150808   0.12009437   2.844             0.004460 ** 
## break_change        0.90602065   0.11095569   8.166 0.000000000000000320 ***
## distance            0.78405900   0.11636185   6.738 0.000000000016045948 ***
## release_spin_rate   0.00035075   0.00007507   4.673 0.000002974568745848 ***
## I(plate_x^2)       -1.33736701   0.06042433 -22.133 < 0.0000000000000002 ***
## I(plate_z^2)       -0.95346597   0.05338985 -17.859 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 52181  on 42689  degrees of freedom
## Residual deviance: 43777  on 42653  degrees of freedom
##   (87 observations deleted due to missingness)
## AIC: 43851
## 
## Number of Fisher Scoring iterations: 7
fb_strike <- augment(fb_model_strike_final)
  
fb_strike <- fb_strike %>% 
  mutate(prediction_log = predict(fb_model_strike_final, fb_strike),
         prediction = 1 / (1 + exp(-prediction_log)),
         rounded_pred = case_when(
           prediction >= 0.5 ~ 1,
           prediction < 0.5 ~ 0
         )) %>% 
  filter(!is.na(prediction)) %>% 
  mutate(Response = "strike",
         pitch = "fastball")
all_models_data <- bind_rows(
                    sl_whiff, fb_whiff,
                    sl_barrel, fb_barrel,
                    sl_strike, fb_strike,
                    sl_whiff_int, fb_whiff_int,
                    sl_whiff_zone, fb_whiff_zone) %>% 
  select(Response:pitch, type, whiff:pred_bwhiff, `I(plate_x^2)`:count, `.fitted`:rounded_pred)

all_models <- bind_rows(
  tidy(sl_model_whiff) %>% mutate(Response = "whiff", pitch = "slider", type = "complex"), 
  tidy(fb_model_whiff) %>% mutate(Response = "whiff", pitch = "fastball", type = "complex"),
  tidy(sl_model_barrel) %>% mutate(Response = "barrel", pitch = "slider", type = "complex"),
  tidy(fb_model_barrel) %>% mutate(Response = "barrel", pitch = "fastball", type = "complex"),
  tidy(sl_model_strike) %>% mutate(Response = "strike", pitch = "slider", type = "complex"),
  tidy(fb_model_strike) %>% mutate(Response = "strike", pitch = "fastball", type = "complex"),
  tidy(sl_model_whiff_int) %>% mutate(Response = "whiff", pitch = "slider", 
                                      type = "intercept only"),
  tidy(fb_model_whiff_int) %>% mutate(Response = "whiff", pitch = "fastball", 
                                      type = "intercept only"),
  tidy(sl_model_whiff_zone) %>% mutate(Response = "whiff", pitch = "slider", 
                                      type = "zone only"),
  tidy(fb_model_whiff_zone) %>% mutate(Response = "whiff", pitch = "fastball", 
                                      type = "zone only")
)

all_models_data$type <- all_models_data$type %>% 
  replace_na("complex")
# P-Value
all_models %>%
  filter(type == "complex") %>% 
  mutate(model_id = paste0(Response, pitch, type)) %>% 
  group_by(model_id) %>% 
  ggplot(aes(x = Response, y = term, fill = p.value)) +
  geom_tile() +
  facet_wrap(~ pitch) +
  scale_fill_gradient(limits = c(0, 0.05),
                      low = "green", high = "yellow", na.value = "lightgray") +
  theme_classic() +
  NULL

# Estimate
all_models %>%
  filter(type == "complex") %>% 
  mutate(model_id = paste0(Response, pitch, type)) %>% 
  group_by(model_id) %>% 
  ggplot(aes(x = Response, y = term, fill = estimate)) +
  geom_tile() +
  facet_wrap(~ pitch) +
  scale_fill_brewer(type = "div") +
  scale_fill_gradient2(low = "red", high = "green", mid = "white", midpoint = 0,
                       limits = c(-10, 10)) +
  theme_classic() +
  NULL

# Log Estimate
all_models %>%
  filter(type == "complex") %>% 
  mutate(log_estimate = log(abs(estimate))*(estimate/abs(estimate))) %>% 
  mutate(model_id = paste0(Response, pitch, type)) %>% 
  group_by(model_id) %>% 
  ggplot(aes(x = Response, y = term, fill = log_estimate)) +
  geom_tile() +
  facet_wrap(~ pitch) +
  scale_fill_brewer(type = "div") +
  scale_fill_gradient2(low = "red", high = "green", mid = "white", midpoint = 0) +
  theme_classic() +
  NULL

# all_pitches %>% 
#   mutate(pitch_speed = round(pitch_speed, 0)) %>% 
#   group_by(p_throws, hitter, pitch_type, pitch_speed) %>% 
#   filter(!is.na(pfx_x)) %>% 
#   filter(!is.na(pfx_z)) %>% 
#   summarize(min_x = min(pfx_x), max_x = max(pfx_x),
#             min_z = min(pfx_z), max_z = max(pfx_z)) %>% 
#   write.csv("Movement.csv")


# write.csv(all_models, "models1.csv")